├── .clang-format ├── .github └── workflows │ ├── semgrep.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── cgo.go ├── cgo_test.go ├── fixed_point.go ├── fixed_point_test.go ├── generalisation_test.go ├── go.mod ├── go.sum ├── include ├── Makefile ├── bpf │ ├── bpf_endian.h │ ├── bpf_helper_defs.h │ └── bpf_helpers.h ├── fasthash.h ├── in.h ├── ip.h ├── linux │ ├── bpf.h │ ├── bpf_common.h │ └── types.h ├── lookup3.h ├── mindef.h └── stdbool.h ├── rake_bpfeb.go ├── rake_bpfel.go ├── rakelimit.go ├── rakelimit_test.go └── src ├── common.h ├── countmin.h ├── ewma.h ├── fixed-point.h └── rakelimit.c /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | BasedOnStyle: LLVM 4 | AlignAfterOpenBracket: DontAlign 5 | AlignConsecutiveAssignments: true 6 | AlignEscapedNewlines: DontAlign 7 | AlwaysBreakBeforeMultilineStrings: true 8 | AlwaysBreakTemplateDeclarations: false 9 | AllowAllParametersOfDeclarationOnNextLine: false 10 | AllowShortFunctionsOnASingleLine: false 11 | BreakBeforeBraces: Linux 12 | IndentWidth: 4 13 | KeepEmptyLinesAtTheStartOfBlocks: false 14 | TabWidth: 4 15 | UseTab: ForContinuationAndIndentation 16 | ColumnLimit: 1000 17 | ... 18 | -------------------------------------------------------------------------------- /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | 2 | on: 3 | pull_request: {} 4 | workflow_dispatch: {} 5 | push: 6 | branches: 7 | - main 8 | - master 9 | schedule: 10 | - cron: '0 0 * * *' 11 | name: Semgrep config 12 | jobs: 13 | semgrep: 14 | name: semgrep/ci 15 | runs-on: ubuntu-20.04 16 | env: 17 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 18 | SEMGREP_URL: https://cloudflare.semgrep.dev 19 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 20 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 21 | container: 22 | image: returntocorp/semgrep 23 | steps: 24 | - uses: actions/checkout@v3 25 | - run: semgrep ci 26 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # Anytime we push to any branch 2 | on: push 3 | 4 | jobs: 5 | test: 6 | name: Test 7 | runs-on: ubuntu-20.04 8 | 9 | steps: 10 | - name: Set up Go 1.x 11 | uses: actions/setup-go@v2 12 | with: 13 | go-version: ^1.15 14 | 15 | - name: Check out code into the Go module directory 16 | uses: actions/checkout@v2 17 | 18 | - name: Install clang 19 | run: | 20 | wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key 2>/dev/null | sudo apt-key add - 21 | echo 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-12 main' | sudo tee /etc/apt/sources.list.d/clang.list 22 | # Only update the llvm repo, this is a lot faster. 23 | sudo apt-get update -o Dir::Etc::sourcelist="sources.list.d/clang.list" -o Dir::Etc::sourceparts="-" -o APT::Get::List-Cleanup="0" 24 | sudo apt-get install -y --no-install-recommends clang-12 25 | 26 | - name: Check lint 27 | # gofmt doesn't report any changes 28 | run: test -z $(gofmt -l ./ | tee /dev/stderr) 29 | 30 | - name: Run tests 31 | run: | 32 | sudo sysctl -w net.core.optmem_max=22528 33 | go test -tags cgotest -exec sudo ./... 
34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | *.o 8 | bin/ 9 | deb/ 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | *.json 14 | 15 | # Output of the go coverage tool, specifically when used with LiteIDE 16 | *.out 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, Cloudflare. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rakelimit 2 | 3 | A multi-dimensional fair-share rate limiter in BPF, designed for UDP. 4 | The algorithm is based on Hierarchical Heavy Hitters, and ensures that no party can exceed 5 | a certain rate of packets. For more information please take a look at our [blog post](https://blog.cloudflare.com/building-rakelimit/). 6 | 7 | ## Usage 8 | 9 | To activate rakelimit create a new instance and provide a file descriptor and a rate limit that you think the 10 | service in question won't be able to handle anymore: 11 | 12 | ```go 13 | 14 | conn, err := net.ListenPacket("udp4", "127.0.0.1:0") 15 | if err != nil { 16 | tb.Fatal("Can't listen:", err) 17 | } 18 | udpConn := conn.(*net.UDPConn) 19 | 20 | // We don't want to allow anyone to use more than 128 packets per second 21 | ppsPerSecond := 128 22 | rake, err := New(udpConn, ppsPerSecond) 23 | defer rake.Close() 24 | // rate limiter stays active even after closing 25 | ``` 26 | 27 | That's all! The library now enforces rate limits on incoming packets, and it happens within the kernel. 28 | 29 | ## Requirements 30 | 31 | The library should be go-gettable, and has been tested on Linux 5.11. 
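Since the snippet under Usage leaves out imports and error handling, here it is expanded into a self-contained sketch; the read loop, the error handling and the 128 pps figure are illustrative, and the exact signature of `New` should be checked against the package documentation:

```go
package main

import (
	"log"
	"net"

	"github.com/cloudflare/rakelimit"
)

func main() {
	conn, err := net.ListenPacket("udp4", "127.0.0.1:0")
	if err != nil {
		log.Fatal("Can't listen:", err)
	}
	udpConn := conn.(*net.UDPConn)

	// Illustrative limit: no single party should exceed 128 packets per second.
	const packetsPerSecond = 128

	rake, err := rakelimit.New(udpConn, packetsPerSecond)
	if err != nil {
		log.Fatal("Can't create rate limiter:", err)
	}
	// Close releases the Go-side resources; as noted in the Usage snippet, the
	// filter itself stays attached to the socket.
	defer rake.Close()

	// Read as usual: packets above the limit are dropped inside the kernel and
	// never reach ReadFrom.
	buf := make([]byte, 1500)
	for {
		n, addr, err := udpConn.ReadFrom(buf)
		if err != nil {
			log.Fatal(err)
		}
		log.Printf("%d bytes from %s", n, addr)
	}
}
```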
32 | 33 | You may have to increase optmem_max depending on your distribution: 34 | 35 | ``` 36 | sudo sysctl -w net.core.optmem_max=22528 37 | ``` 38 | 39 | You will need a `clang-12` binary if you want to recompile the filter. Simply run `go generate` in the root of the project. 40 | 41 | ## Limitations 42 | - IPv6 doesn't support options 43 | - requires tweaking of optmem 44 | - not tested in production 45 | 46 | ## Testing 47 | 48 | ``` 49 | go test . 50 | ``` 51 | -------------------------------------------------------------------------------- /cgo.go: -------------------------------------------------------------------------------- 1 | // +build cgo,cgotest 2 | 3 | package rakelimit 4 | 5 | // #cgo CFLAGS: -Iinclude 6 | // #include "stdlib.h" 7 | // #include "fasthash.h" 8 | import "C" 9 | 10 | func fasthash64(buf []byte) uint64 { 11 | ptr := C.CBytes(buf) 12 | defer C.free(ptr) 13 | 14 | return uint64(C.fasthash64(ptr, C.__u64(len(buf)), 0)) 15 | } 16 | -------------------------------------------------------------------------------- /cgo_test.go: -------------------------------------------------------------------------------- 1 | // +build cgo,cgotest 2 | 3 | package rakelimit 4 | 5 | import ( 6 | "encoding/hex" 7 | "testing" 8 | ) 9 | 10 | func TestFasthash64(t *testing.T) { 11 | golden := []struct { 12 | input []byte 13 | hash uint64 14 | }{ 15 | {[]byte("asdefg"), 0x07ffd15db88b150b}, 16 | {[]byte("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."), 0xbb1655682c0ac75d}, 17 | } 18 | 19 | for _, gold := range golden { 20 | have := fasthash64(gold.input) 21 | if have != gold.hash { 22 | t.Logf("\n%s", hex.Dump(gold.input)) 23 | t.Errorf("Expected hash %016x, got %016x", gold.hash, have) 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /fixed_point.go: -------------------------------------------------------------------------------- 1 | package rakelimit 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | const fractionBits = 32 8 | 9 | func floatToFixed(f float64) uint64 { 10 | ret := uint64(0) 11 | for i := 64 - fractionBits; i >= -fractionBits; i-- { 12 | ret = ret << 1 13 | if f >= math.Pow(2, float64(i)) { 14 | ret |= 1 15 | f -= math.Pow(2, float64(i)) 16 | } 17 | } 18 | return ret 19 | } 20 | 21 | func fixedToFloat(f uint64) float64 { 22 | ret := float64(0) 23 | for i := 64 - fractionBits - 1; i >= -fractionBits; i-- { 24 | if f&(1<<(i+fractionBits)) != 0 { 25 | ret += math.Pow(2, float64(i)) 26 | } 27 | } 28 | return ret 29 | } 30 | -------------------------------------------------------------------------------- /fixed_point_test.go: -------------------------------------------------------------------------------- 1 | package rakelimit 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | ) 7 | 8 | func TestFloatToFixedPoint(t *testing.T) { 9 | x := float64(1.0 / 7.0) 10 | y := fixedToFloat(floatToFixed(x)) 11 | if math.Abs(y-x) > 0.000000001 { 12 | t.Fatal("Difference too large", x, y) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /generalisation_test.go: -------------------------------------------------------------------------------- 1 | package rakelimit 2 | 3 | import ( 4 | "bytes" 5 | "flag" 6 | "fmt" 7 | "net" 8 | "os" 9 | "testing" 10 | 11 | "math" 12 | "math/rand" 13 | "sort" 14 | "time" 15 | 16 | "github.com/google/gopacket" 17 | "github.com/google/gopacket/layers" 18 | ) 19 | 20 | var seed int64 21 | 22 
| func TestMain(m *testing.M) { 23 | flag.Int64Var(&seed, "seed", 0, "seed for the random number generator") 24 | flag.Parse() 25 | 26 | if seed == 0 { 27 | seed = time.Now().UnixNano() 28 | } 29 | 30 | fmt.Println("Seed is", seed) 31 | os.Exit(m.Run()) 32 | } 33 | 34 | type element struct { 35 | SourceAddress net.IP 36 | SourcePort int 37 | DestinationAddress net.IP 38 | DestinationPort int 39 | } 40 | 41 | func (el *element) Clone() *element { 42 | newEl := element{ 43 | SourcePort: el.SourcePort, 44 | DestinationPort: el.DestinationPort, 45 | SourceAddress: make([]byte, len(el.SourceAddress)), 46 | DestinationAddress: make([]byte, len(el.DestinationAddress)), 47 | } 48 | 49 | copy(newEl.SourceAddress, el.SourceAddress) 50 | copy(newEl.DestinationAddress, el.DestinationAddress) 51 | 52 | return &newEl 53 | } 54 | 55 | func (el *element) String() string { 56 | return fmt.Sprintf("%s:%d --> %s:%d", el.SourceAddress, el.SourcePort, el.DestinationAddress, el.DestinationPort) 57 | } 58 | 59 | func (el *element) marshal() []byte { 60 | var packet []gopacket.SerializableLayer 61 | if len(el.SourceAddress) == net.IPv4len { 62 | packet = []gopacket.SerializableLayer{ 63 | &layers.Ethernet{ 64 | SrcMAC: []byte{1, 2, 3, 4, 5, 6}, 65 | DstMAC: []byte{6, 5, 4, 3, 2, 1}, 66 | EthernetType: layers.EthernetTypeIPv4, 67 | }, 68 | &layers.IPv4{ 69 | Version: 4, 70 | SrcIP: el.SourceAddress, 71 | DstIP: el.DestinationAddress, 72 | Protocol: layers.IPProtocolUDP, 73 | }, 74 | &layers.UDP{ 75 | SrcPort: layers.UDPPort(el.SourcePort), 76 | DstPort: layers.UDPPort(el.DestinationPort), 77 | }, 78 | gopacket.Payload([]byte{1, 2, 3, 4}), 79 | } 80 | } else { 81 | packet = []gopacket.SerializableLayer{ 82 | &layers.Ethernet{ 83 | SrcMAC: []byte{1, 2, 3, 4, 5, 6}, 84 | DstMAC: []byte{6, 5, 4, 3, 2, 1}, 85 | EthernetType: layers.EthernetTypeIPv6, 86 | }, 87 | &layers.IPv6{ 88 | Version: 6, 89 | SrcIP: el.SourceAddress, 90 | DstIP: el.DestinationAddress, 91 | NextHeader: layers.IPProtocolUDP, 92 | }, 93 | &layers.UDP{ 94 | SrcPort: layers.UDPPort(el.SourcePort), 95 | DstPort: layers.UDPPort(el.DestinationPort), 96 | }, 97 | gopacket.Payload([]byte{1, 2, 3, 4}), 98 | } 99 | } 100 | 101 | buf := gopacket.NewSerializeBuffer() 102 | opts := gopacket.SerializeOptions{ 103 | FixLengths: true, 104 | } 105 | gopacket.SerializeLayers(buf, opts, packet...) 106 | return buf.Bytes() 107 | } 108 | 109 | type packet struct { 110 | received uint64 111 | key string 112 | element 113 | } 114 | 115 | type packetSpec struct { 116 | key string 117 | rate int 118 | element 119 | } 120 | 121 | func generatePackets(duration time.Duration, specs ...packetSpec) []packet { 122 | // specs describe individual streams of packets that "arrive" concurrently. 123 | // We need to emit packets from the specs in the correct order, determined 124 | // by their rate. 
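// For example, a spec with rate 100 over a 2s duration expands into 200 steps spaced
// 10ms apart; the steps of all specs are then merged and sorted by timestamp so the
// emitted packets interleave the way concurrent streams would.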
125 | type step struct { 126 | now uint64 127 | packetSpec 128 | } 129 | 130 | var steps []step 131 | for _, spec := range specs { 132 | interval := time.Second / time.Duration(spec.rate) / time.Nanosecond 133 | for i := 0; i < int(duration/interval); i++ { 134 | steps = append(steps, step{ 135 | uint64(i) * uint64(interval), 136 | spec, 137 | }) 138 | } 139 | 140 | } 141 | 142 | sort.Slice(steps, func(i, j int) bool { 143 | return steps[i].now < steps[j].now 144 | }) 145 | 146 | rng := rand.New(rand.NewSource(seed)) 147 | incompleteIP := func(ip net.IP) bool { 148 | return len(ip) != net.IPv4len && len(ip) != net.IPv6len 149 | } 150 | 151 | var packets []packet 152 | var prev element 153 | for _, step := range steps { 154 | source := step.SourceAddress 155 | if incompleteIP(source) { 156 | source = randomIP(rng, prev.SourceAddress, source) 157 | } 158 | 159 | sourcePort := step.SourcePort 160 | if sourcePort == -1 { 161 | sourcePort = randomPort(rng, prev.SourcePort) 162 | } 163 | 164 | dest := step.DestinationAddress 165 | if incompleteIP(dest) { 166 | dest = randomIP(rng, prev.DestinationAddress, dest) 167 | } 168 | 169 | destPort := step.DestinationPort 170 | if destPort == -1 { 171 | destPort = randomPort(rng, prev.DestinationPort) 172 | } 173 | 174 | next := element{ 175 | source, sourcePort, 176 | dest, destPort, 177 | } 178 | 179 | packets = append(packets, packet{ 180 | received: step.now, 181 | key: step.key, 182 | element: next, 183 | }) 184 | 185 | prev = next 186 | } 187 | 188 | return packets 189 | } 190 | 191 | func randomPort(rng *rand.Rand, prevPort int) int { 192 | port := int(rng.Intn(math.MaxUint16)) 193 | for port == prevPort { 194 | port = int(rng.Intn(math.MaxUint16)) 195 | } 196 | return port 197 | } 198 | 199 | func randomIP(rng *rand.Rand, prevIP net.IP, template net.IP) net.IP { 200 | if len(template) == cap(template) { 201 | panic(fmt.Sprint("invalid template:", template)) 202 | } 203 | 204 | ip := make(net.IP, cap(template)) 205 | copy(ip, template) 206 | 207 | rand.Read(ip[len(template):]) 208 | for bytes.Equal([]byte(prevIP), []byte(ip)) { 209 | rand.Read(ip[len(template):]) 210 | } 211 | 212 | return ip 213 | } 214 | 215 | func ipTemplate(ip net.IP, ipLen int) net.IP { 216 | template := make(net.IP, len(ip), ipLen) 217 | copy(template, ip) 218 | return template 219 | } 220 | 221 | func TestRate(t *testing.T) { 222 | const ( 223 | duration = 10 * time.Second 224 | limit = 100 225 | ) 226 | 227 | rake := mustNew(t, "127.0.0.1:0", limit) 228 | 229 | packets := generatePackets(duration, packetSpec{ 230 | rate: 2 * limit, 231 | element: element{ 232 | SourceAddress: []byte{7, 6, 5, 4}, 233 | DestinationAddress: []byte{1, 2, 3, 4}, 234 | SourcePort: 53, 235 | DestinationPort: 443, 236 | }, 237 | }) 238 | 239 | var accepted int 240 | for i, packet := range packets { 241 | rake.updateTime(t, packet.received) 242 | 243 | verdict, _, err := rake.testProgram.Test(packet.marshal()) 244 | if err != nil { 245 | t.Fatal(err) 246 | } 247 | 248 | if level := rake.rateExceededOnLevel(t); i > 0 && level != 0 { 249 | t.Fatalf("Packet is matched on level %d instead of 0", level) 250 | } 251 | 252 | if verdict > 0 { 253 | accepted++ 254 | } 255 | } 256 | 257 | acceptedRate := float64(accepted) / duration.Seconds() 258 | if acceptedRate < limit*0.95 || acceptedRate > limit*1.05 { 259 | t.Errorf("Didn't match desired rate of %d: %.2f pps accepted", limit, acceptedRate) 260 | } 261 | } 262 | 263 | func TestGeneralisations(t *testing.T) { 264 | const ( 265 | limit = 100 266 | ) 267 | 
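// The cases below each pin the flow at one level of generalisation: level 0 is the
// fully specified 4-tuple, higher levels progressively widen the source network and
// randomise the source or destination port, and level 4 keeps only the destination
// address fixed. A -1 port or a truncated IP tells generatePackets to randomise that
// field for every packet.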
268 | ipv6Src := net.ParseIP("1122:3344:5566:7788::aabb") 269 | ipv6Dst := net.ParseIP("8877:6655:4433:2211::ffee") 270 | srcPort := 53 271 | dstPort := 443 272 | 273 | type testcase struct { 274 | level uint32 275 | listen string 276 | element 277 | } 278 | 279 | var generalisations []testcase 280 | for _, proto := range []struct { 281 | listen string 282 | src, srcNet net.IP 283 | dst net.IP 284 | wildcard net.IP 285 | }{ 286 | { 287 | "127.0.0.1:0", 288 | net.IP{7, 6, 5, 4}, ipTemplate(net.IP{7, 6, 5}, net.IPv4len), 289 | net.IP{1, 2, 3, 4}, 290 | ipTemplate(nil, net.IPv4len), 291 | }, 292 | { 293 | "[::1]:0", 294 | ipv6Src, ipv6Src[: 64/8 : net.IPv6len], 295 | ipv6Dst, 296 | ipTemplate(nil, net.IPv6len), 297 | }, 298 | } { 299 | generalisations = append(generalisations, 300 | // level 0 301 | testcase{0, proto.listen, element{proto.src, srcPort, proto.dst, dstPort}}, 302 | 303 | // level 1 304 | testcase{1, proto.listen, element{proto.srcNet, srcPort, proto.dst, dstPort}}, 305 | testcase{1, proto.listen, element{proto.src, -1, proto.dst, dstPort}}, 306 | testcase{1, proto.listen, element{proto.src, srcPort, proto.dst, -1}}, 307 | 308 | // level 2 309 | testcase{2, proto.listen, element{proto.wildcard, srcPort, proto.dst, dstPort}}, 310 | testcase{2, proto.listen, element{proto.srcNet, -1, proto.dst, dstPort}}, 311 | testcase{2, proto.listen, element{proto.srcNet, srcPort, proto.dst, -1}}, 312 | testcase{2, proto.listen, element{proto.src, -1, proto.dst, -1}}, 313 | 314 | // level 3 315 | testcase{3, proto.listen, element{proto.wildcard, -1, proto.dst, dstPort}}, 316 | testcase{3, proto.listen, element{proto.wildcard, srcPort, proto.dst, -1}}, 317 | testcase{3, proto.listen, element{proto.srcNet, -1, proto.dst, -1}}, 318 | 319 | // level 4 320 | testcase{4, proto.listen, element{proto.wildcard, -1, proto.dst, -1}}, 321 | ) 322 | } 323 | 324 | for _, gen := range generalisations { 325 | t.Run(gen.String(), func(t *testing.T) { 326 | rake := mustNew(t, gen.listen, limit) 327 | 328 | // Drop all packets once rate exceeds limit 329 | rake.updateRand(t, math.MaxUint32) 330 | 331 | packets := generatePackets(time.Second, packetSpec{ 332 | rate: limit + 1, 333 | element: gen.element, 334 | }) 335 | 336 | for i, packet := range packets { 337 | rake.updateTime(t, packet.received) 338 | 339 | t.Logf("%d: %s", i, &packet.element) 340 | verdict, _, err := rake.testProgram.Test(packet.marshal()) 341 | if err != nil { 342 | t.Fatal(err) 343 | } 344 | 345 | if i == 0 { 346 | if verdict == 0 { 347 | t.Fatal("First packet shouldn't be dropped") 348 | } 349 | 350 | continue 351 | } 352 | 353 | if verdict > 0 { 354 | t.Fatalf("Accepted packet #%d", i) 355 | } 356 | 357 | level := rake.rateExceededOnLevel(t) 358 | if level != gen.level { 359 | t.Fatalf("Packet #%d was dropped on level %d instead of %d", i, level, gen.level) 360 | } 361 | } 362 | }) 363 | } 364 | } 365 | 366 | func TestAttackPropagation(t *testing.T) { 367 | const limit = 2645 368 | 369 | packets := generatePackets(10*time.Second, 370 | packetSpec{ 371 | key: "attack", 372 | rate: 3 * limit, 373 | element: element{ 374 | SourceAddress: []byte{7, 6, 5, 4}, 375 | DestinationAddress: []byte{1, 2, 3, 4}, 376 | SourcePort: 53, 377 | DestinationPort: 443, 378 | }, 379 | }, 380 | packetSpec{ 381 | key: "legit", 382 | rate: limit / 2, 383 | element: element{ 384 | SourceAddress: []byte{7, 6, 5, 4}, 385 | DestinationAddress: []byte{1, 2, 3, 4}, 386 | SourcePort: -1, 387 | DestinationPort: 443, 388 | }, 389 | }, 390 | ) 391 | 392 | rake := mustNew(t, 
"127.0.0.1:0", limit) 393 | rake.updateRand(t, math.MaxUint32) 394 | for i, packet := range packets { 395 | rake.updateTime(t, packet.received) 396 | 397 | verdict, _, err := rake.testProgram.Test(packet.marshal()) 398 | if err != nil { 399 | t.Fatal(err) 400 | } 401 | 402 | if packet.key == "legit" && verdict == 0 { 403 | t.Fatalf("Dropped legitimate packet #%d: %v", i, rake.rateExceededOnLevel(t)) 404 | } 405 | } 406 | } 407 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/cloudflare/rakelimit 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/cilium/ebpf v0.5.1-0.20210527163130-29f67e0a7450 7 | github.com/google/gopacket v1.1.18 8 | golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c 9 | ) 10 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/cilium/ebpf v0.4.0 h1:QlHdikaxALkqWasW8hAC1mfR0jdmvbfaBdBPFmRSglA= 2 | github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= 3 | github.com/cilium/ebpf v0.5.0 h1:E1KshmrMEtkMP2UjlWzfmUV1owWY+BnbL5FxxuatnrU= 4 | github.com/cilium/ebpf v0.5.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= 5 | github.com/cilium/ebpf v0.5.1-0.20210526091824-eeeaaac5dac5 h1:XEgY0nNTP+a4vm1gYceREVgr4nTyLfBHBEk4x4TiI/Q= 6 | github.com/cilium/ebpf v0.5.1-0.20210526091824-eeeaaac5dac5/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= 7 | github.com/cilium/ebpf v0.5.1-0.20210527163130-29f67e0a7450 h1:5xuyArKXqJdmfbPcfheMHyAswscRSBB2uJG5O0aRETA= 8 | github.com/cilium/ebpf v0.5.1-0.20210527163130-29f67e0a7450/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= 9 | github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= 10 | github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= 11 | github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M= 12 | github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 13 | github.com/google/gopacket v1.1.18 h1:lum7VRA9kdlvBi7/v2p7/zcbkduHaCH/SVVyurs7OpY= 14 | github.com/google/gopacket v1.1.18/go.mod h1:UdDNZ1OO62aGYVnPhxT1U6aI7ukYtA/kB8vaU0diBUM= 15 | github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= 16 | github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= 17 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 18 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= 19 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 20 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 21 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 22 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 23 | golang.org/x/sys v0.0.0-20190405154228-4b34438f7a67/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 24 | golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c h1:VwygUrnw9jn88c4u8GD3rZQbqrP/tgas88tPUbBxQrk= 25 | golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 26 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 27 | 
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= 28 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 29 | -------------------------------------------------------------------------------- /include/Makefile: -------------------------------------------------------------------------------- 1 | LINUX_INC ?= /usr/include 2 | LIBBPF_INC ?= /usr/local/include 3 | 4 | .PHONY: update 5 | update: 6 | rsync --existing --exclude "types.h" -av "$(LINUX_INC)/linux/" "$(CURDIR)/linux" 7 | rsync --existing -av "$(LIBBPF_INC)/bpf/" "$(CURDIR)/bpf" 8 | -------------------------------------------------------------------------------- /include/bpf/bpf_endian.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 | #ifndef __BPF_ENDIAN__ 3 | #define __BPF_ENDIAN__ 4 | 5 | /* 6 | * Isolate byte #n and put it into byte #m, for __u##b type. 7 | * E.g., moving byte #6 (nnnnnnnn) into byte #1 (mmmmmmmm) for __u64: 8 | * 1) xxxxxxxx nnnnnnnn xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx mmmmmmmm xxxxxxxx 9 | * 2) nnnnnnnn xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx mmmmmmmm xxxxxxxx 00000000 10 | * 3) 00000000 00000000 00000000 00000000 00000000 00000000 00000000 nnnnnnnn 11 | * 4) 00000000 00000000 00000000 00000000 00000000 00000000 nnnnnnnn 00000000 12 | */ 13 | #define ___bpf_mvb(x, b, n, m) ((__u##b)(x) << (b-(n+1)*8) >> (b-8) << (m*8)) 14 | 15 | #define ___bpf_swab16(x) ((__u16)( \ 16 | ___bpf_mvb(x, 16, 0, 1) | \ 17 | ___bpf_mvb(x, 16, 1, 0))) 18 | 19 | #define ___bpf_swab32(x) ((__u32)( \ 20 | ___bpf_mvb(x, 32, 0, 3) | \ 21 | ___bpf_mvb(x, 32, 1, 2) | \ 22 | ___bpf_mvb(x, 32, 2, 1) | \ 23 | ___bpf_mvb(x, 32, 3, 0))) 24 | 25 | #define ___bpf_swab64(x) ((__u64)( \ 26 | ___bpf_mvb(x, 64, 0, 7) | \ 27 | ___bpf_mvb(x, 64, 1, 6) | \ 28 | ___bpf_mvb(x, 64, 2, 5) | \ 29 | ___bpf_mvb(x, 64, 3, 4) | \ 30 | ___bpf_mvb(x, 64, 4, 3) | \ 31 | ___bpf_mvb(x, 64, 5, 2) | \ 32 | ___bpf_mvb(x, 64, 6, 1) | \ 33 | ___bpf_mvb(x, 64, 7, 0))) 34 | 35 | /* LLVM's BPF target selects the endianness of the CPU 36 | * it compiles on, or the user specifies (bpfel/bpfeb), 37 | * respectively. The used __BYTE_ORDER__ is defined by 38 | * the compiler, we cannot rely on __BYTE_ORDER from 39 | * libc headers, since it doesn't reflect the actual 40 | * requested byte order. 41 | * 42 | * Note, LLVM's BPF target has different __builtin_bswapX() 43 | * semantics. It does map to BPF_ALU | BPF_END | BPF_TO_BE 44 | * in bpfel and bpfeb case, which means below, that we map 45 | * to cpu_to_be16(). We could use it unconditionally in BPF 46 | * case, but better not rely on it, so that this header here 47 | * can be used from application and BPF program side, which 48 | * use different targets. 
49 | */ 50 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 51 | # define __bpf_ntohs(x) __builtin_bswap16(x) 52 | # define __bpf_htons(x) __builtin_bswap16(x) 53 | # define __bpf_constant_ntohs(x) ___bpf_swab16(x) 54 | # define __bpf_constant_htons(x) ___bpf_swab16(x) 55 | # define __bpf_ntohl(x) __builtin_bswap32(x) 56 | # define __bpf_htonl(x) __builtin_bswap32(x) 57 | # define __bpf_constant_ntohl(x) ___bpf_swab32(x) 58 | # define __bpf_constant_htonl(x) ___bpf_swab32(x) 59 | # define __bpf_be64_to_cpu(x) __builtin_bswap64(x) 60 | # define __bpf_cpu_to_be64(x) __builtin_bswap64(x) 61 | # define __bpf_constant_be64_to_cpu(x) ___bpf_swab64(x) 62 | # define __bpf_constant_cpu_to_be64(x) ___bpf_swab64(x) 63 | #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 64 | # define __bpf_ntohs(x) (x) 65 | # define __bpf_htons(x) (x) 66 | # define __bpf_constant_ntohs(x) (x) 67 | # define __bpf_constant_htons(x) (x) 68 | # define __bpf_ntohl(x) (x) 69 | # define __bpf_htonl(x) (x) 70 | # define __bpf_constant_ntohl(x) (x) 71 | # define __bpf_constant_htonl(x) (x) 72 | # define __bpf_be64_to_cpu(x) (x) 73 | # define __bpf_cpu_to_be64(x) (x) 74 | # define __bpf_constant_be64_to_cpu(x) (x) 75 | # define __bpf_constant_cpu_to_be64(x) (x) 76 | #else 77 | # error "Fix your compiler's __BYTE_ORDER__?!" 78 | #endif 79 | 80 | #define bpf_htons(x) \ 81 | (__builtin_constant_p(x) ? \ 82 | __bpf_constant_htons(x) : __bpf_htons(x)) 83 | #define bpf_ntohs(x) \ 84 | (__builtin_constant_p(x) ? \ 85 | __bpf_constant_ntohs(x) : __bpf_ntohs(x)) 86 | #define bpf_htonl(x) \ 87 | (__builtin_constant_p(x) ? \ 88 | __bpf_constant_htonl(x) : __bpf_htonl(x)) 89 | #define bpf_ntohl(x) \ 90 | (__builtin_constant_p(x) ? \ 91 | __bpf_constant_ntohl(x) : __bpf_ntohl(x)) 92 | #define bpf_cpu_to_be64(x) \ 93 | (__builtin_constant_p(x) ? \ 94 | __bpf_constant_cpu_to_be64(x) : __bpf_cpu_to_be64(x)) 95 | #define bpf_be64_to_cpu(x) \ 96 | (__builtin_constant_p(x) ? \ 97 | __bpf_constant_be64_to_cpu(x) : __bpf_be64_to_cpu(x)) 98 | 99 | #endif /* __BPF_ENDIAN__ */ 100 | -------------------------------------------------------------------------------- /include/bpf/bpf_helper_defs.h: -------------------------------------------------------------------------------- 1 | /* This is auto-generated file. See bpf_helpers_doc.py for details. */ 2 | 3 | /* Forward declarations of BPF structs */ 4 | struct bpf_fib_lookup; 5 | struct bpf_sk_lookup; 6 | struct bpf_perf_event_data; 7 | struct bpf_perf_event_value; 8 | struct bpf_pidns_info; 9 | struct bpf_sock; 10 | struct bpf_sock_addr; 11 | struct bpf_sock_ops; 12 | struct bpf_sock_tuple; 13 | struct bpf_spin_lock; 14 | struct bpf_sysctl; 15 | struct bpf_tcp_sock; 16 | struct bpf_tunnel_key; 17 | struct bpf_xfrm_state; 18 | struct pt_regs; 19 | struct sk_reuseport_md; 20 | struct sockaddr; 21 | struct tcphdr; 22 | struct seq_file; 23 | struct tcp6_sock; 24 | struct tcp_sock; 25 | struct tcp_timewait_sock; 26 | struct tcp_request_sock; 27 | struct udp6_sock; 28 | struct task_struct; 29 | struct __sk_buff; 30 | struct sk_msg_md; 31 | struct xdp_md; 32 | 33 | /* 34 | * bpf_map_lookup_elem 35 | * 36 | * Perform a lookup in *map* for an entry associated to *key*. 37 | * 38 | * Returns 39 | * Map value associated to *key*, or **NULL** if no entry was 40 | * found. 41 | */ 42 | static void *(*bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1; 43 | 44 | /* 45 | * bpf_map_update_elem 46 | * 47 | * Add or update the value of the entry associated to *key* in 48 | * *map* with *value*. 
*flags* is one of: 49 | * 50 | * **BPF_NOEXIST** 51 | * The entry for *key* must not exist in the map. 52 | * **BPF_EXIST** 53 | * The entry for *key* must already exist in the map. 54 | * **BPF_ANY** 55 | * No condition on the existence of the entry for *key*. 56 | * 57 | * Flag value **BPF_NOEXIST** cannot be used for maps of types 58 | * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all 59 | * elements always exist), the helper would return an error. 60 | * 61 | * Returns 62 | * 0 on success, or a negative error in case of failure. 63 | */ 64 | static long (*bpf_map_update_elem)(void *map, const void *key, const void *value, __u64 flags) = (void *) 2; 65 | 66 | /* 67 | * bpf_map_delete_elem 68 | * 69 | * Delete entry with *key* from *map*. 70 | * 71 | * Returns 72 | * 0 on success, or a negative error in case of failure. 73 | */ 74 | static long (*bpf_map_delete_elem)(void *map, const void *key) = (void *) 3; 75 | 76 | /* 77 | * bpf_probe_read 78 | * 79 | * For tracing programs, safely attempt to read *size* bytes from 80 | * kernel space address *unsafe_ptr* and store the data in *dst*. 81 | * 82 | * Generally, use **bpf_probe_read_user**\ () or 83 | * **bpf_probe_read_kernel**\ () instead. 84 | * 85 | * Returns 86 | * 0 on success, or a negative error in case of failure. 87 | */ 88 | static long (*bpf_probe_read)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 4; 89 | 90 | /* 91 | * bpf_ktime_get_ns 92 | * 93 | * Return the time elapsed since system boot, in nanoseconds. 94 | * Does not include time the system was suspended. 95 | * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) 96 | * 97 | * Returns 98 | * Current *ktime*. 99 | */ 100 | static __u64 (*bpf_ktime_get_ns)(void) = (void *) 5; 101 | 102 | /* 103 | * bpf_trace_printk 104 | * 105 | * This helper is a "printk()-like" facility for debugging. It 106 | * prints a message defined by format *fmt* (of size *fmt_size*) 107 | * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if 108 | * available. It can take up to three additional **u64** 109 | * arguments (as an eBPF helpers, the total number of arguments is 110 | * limited to five). 111 | * 112 | * Each time the helper is called, it appends a line to the trace. 113 | * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is 114 | * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. 115 | * The format of the trace is customizable, and the exact output 116 | * one will get depends on the options set in 117 | * *\/sys/kernel/debug/tracing/trace_options* (see also the 118 | * *README* file under the same directory). However, it usually 119 | * defaults to something like: 120 | * 121 | * :: 122 | * 123 | * telnet-470 [001] .N.. 419421.045894: 0x00000001: 124 | * 125 | * In the above: 126 | * 127 | * * ``telnet`` is the name of the current task. 128 | * * ``470`` is the PID of the current task. 129 | * * ``001`` is the CPU number on which the task is 130 | * running. 131 | * * In ``.N..``, each character refers to a set of 132 | * options (whether irqs are enabled, scheduling 133 | * options, whether hard/softirqs are running, level of 134 | * preempt_disabled respectively). **N** means that 135 | * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** 136 | * are set. 137 | * * ``419421.045894`` is a timestamp. 138 | * * ``0x00000001`` is a fake value used by BPF for the 139 | * instruction pointer register. 140 | * * ```` is the message formatted with 141 | * *fmt*. 
142 | * 143 | * The conversion specifiers supported by *fmt* are similar, but 144 | * more limited than for printk(). They are **%d**, **%i**, 145 | * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, 146 | * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size 147 | * of field, padding with zeroes, etc.) is available, and the 148 | * helper will return **-EINVAL** (but print nothing) if it 149 | * encounters an unknown specifier. 150 | * 151 | * Also, note that **bpf_trace_printk**\ () is slow, and should 152 | * only be used for debugging purposes. For this reason, a notice 153 | * bloc (spanning several lines) is printed to kernel logs and 154 | * states that the helper should not be used "for production use" 155 | * the first time this helper is used (or more precisely, when 156 | * **trace_printk**\ () buffers are allocated). For passing values 157 | * to user space, perf events should be preferred. 158 | * 159 | * Returns 160 | * The number of bytes written to the buffer, or a negative error 161 | * in case of failure. 162 | */ 163 | static long (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) = (void *) 6; 164 | 165 | /* 166 | * bpf_get_prandom_u32 167 | * 168 | * Get a pseudo-random number. 169 | * 170 | * From a security point of view, this helper uses its own 171 | * pseudo-random internal state, and cannot be used to infer the 172 | * seed of other random functions in the kernel. However, it is 173 | * essential to note that the generator used by the helper is not 174 | * cryptographically secure. 175 | * 176 | * Returns 177 | * A random 32-bit unsigned value. 178 | */ 179 | static __u32 (*bpf_get_prandom_u32)(void) = (void *) 7; 180 | 181 | /* 182 | * bpf_get_smp_processor_id 183 | * 184 | * Get the SMP (symmetric multiprocessing) processor id. Note that 185 | * all programs run with preemption disabled, which means that the 186 | * SMP processor id is stable during all the execution of the 187 | * program. 188 | * 189 | * Returns 190 | * The SMP id of the processor running the program. 191 | */ 192 | static __u32 (*bpf_get_smp_processor_id)(void) = (void *) 8; 193 | 194 | /* 195 | * bpf_skb_store_bytes 196 | * 197 | * Store *len* bytes from address *from* into the packet 198 | * associated to *skb*, at *offset*. *flags* are a combination of 199 | * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the 200 | * checksum for the packet after storing the bytes) and 201 | * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ 202 | * **->swhash** and *skb*\ **->l4hash** to 0). 203 | * 204 | * A call to this helper is susceptible to change the underlying 205 | * packet buffer. Therefore, at load time, all checks on pointers 206 | * previously done by the verifier are invalidated and must be 207 | * performed again, if the helper is used in combination with 208 | * direct packet access. 209 | * 210 | * Returns 211 | * 0 on success, or a negative error in case of failure. 212 | */ 213 | static long (*bpf_skb_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len, __u64 flags) = (void *) 9; 214 | 215 | /* 216 | * bpf_l3_csum_replace 217 | * 218 | * Recompute the layer 3 (e.g. IP) checksum for the packet 219 | * associated to *skb*. Computation is incremental, so the helper 220 | * must know the former value of the header field that was 221 | * modified (*from*), the new value of this field (*to*), and the 222 | * number of bytes (2 or 4) for this field, stored in *size*. 
223 | * Alternatively, it is possible to store the difference between 224 | * the previous and the new values of the header field in *to*, by 225 | * setting *from* and *size* to 0. For both methods, *offset* 226 | * indicates the location of the IP checksum within the packet. 227 | * 228 | * This helper works in combination with **bpf_csum_diff**\ (), 229 | * which does not update the checksum in-place, but offers more 230 | * flexibility and can handle sizes larger than 2 or 4 for the 231 | * checksum to update. 232 | * 233 | * A call to this helper is susceptible to change the underlying 234 | * packet buffer. Therefore, at load time, all checks on pointers 235 | * previously done by the verifier are invalidated and must be 236 | * performed again, if the helper is used in combination with 237 | * direct packet access. 238 | * 239 | * Returns 240 | * 0 on success, or a negative error in case of failure. 241 | */ 242 | static long (*bpf_l3_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 size) = (void *) 10; 243 | 244 | /* 245 | * bpf_l4_csum_replace 246 | * 247 | * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the 248 | * packet associated to *skb*. Computation is incremental, so the 249 | * helper must know the former value of the header field that was 250 | * modified (*from*), the new value of this field (*to*), and the 251 | * number of bytes (2 or 4) for this field, stored on the lowest 252 | * four bits of *flags*. Alternatively, it is possible to store 253 | * the difference between the previous and the new values of the 254 | * header field in *to*, by setting *from* and the four lowest 255 | * bits of *flags* to 0. For both methods, *offset* indicates the 256 | * location of the IP checksum within the packet. In addition to 257 | * the size of the field, *flags* can be added (bitwise OR) actual 258 | * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left 259 | * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and 260 | * for updates resulting in a null checksum the value is set to 261 | * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates 262 | * the checksum is to be computed against a pseudo-header. 263 | * 264 | * This helper works in combination with **bpf_csum_diff**\ (), 265 | * which does not update the checksum in-place, but offers more 266 | * flexibility and can handle sizes larger than 2 or 4 for the 267 | * checksum to update. 268 | * 269 | * A call to this helper is susceptible to change the underlying 270 | * packet buffer. Therefore, at load time, all checks on pointers 271 | * previously done by the verifier are invalidated and must be 272 | * performed again, if the helper is used in combination with 273 | * direct packet access. 274 | * 275 | * Returns 276 | * 0 on success, or a negative error in case of failure. 277 | */ 278 | static long (*bpf_l4_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 flags) = (void *) 11; 279 | 280 | /* 281 | * bpf_tail_call 282 | * 283 | * This special helper is used to trigger a "tail call", or in 284 | * other words, to jump into another eBPF program. The same stack 285 | * frame is used (but values on stack and in registers for the 286 | * caller are not accessible to the callee). This mechanism allows 287 | * for program chaining, either for raising the maximum number of 288 | * available eBPF instructions, or to execute given programs in 289 | * conditional blocks. 
For security reasons, there is an upper 290 | * limit to the number of successive tail calls that can be 291 | * performed. 292 | * 293 | * Upon call of this helper, the program attempts to jump into a 294 | * program referenced at index *index* in *prog_array_map*, a 295 | * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes 296 | * *ctx*, a pointer to the context. 297 | * 298 | * If the call succeeds, the kernel immediately runs the first 299 | * instruction of the new program. This is not a function call, 300 | * and it never returns to the previous program. If the call 301 | * fails, then the helper has no effect, and the caller continues 302 | * to run its subsequent instructions. A call can fail if the 303 | * destination program for the jump does not exist (i.e. *index* 304 | * is superior to the number of entries in *prog_array_map*), or 305 | * if the maximum number of tail calls has been reached for this 306 | * chain of programs. This limit is defined in the kernel by the 307 | * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), 308 | * which is currently set to 32. 309 | * 310 | * Returns 311 | * 0 on success, or a negative error in case of failure. 312 | */ 313 | static long (*bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) = (void *) 12; 314 | 315 | /* 316 | * bpf_clone_redirect 317 | * 318 | * Clone and redirect the packet associated to *skb* to another 319 | * net device of index *ifindex*. Both ingress and egress 320 | * interfaces can be used for redirection. The **BPF_F_INGRESS** 321 | * value in *flags* is used to make the distinction (ingress path 322 | * is selected if the flag is present, egress path otherwise). 323 | * This is the only flag supported for now. 324 | * 325 | * In comparison with **bpf_redirect**\ () helper, 326 | * **bpf_clone_redirect**\ () has the associated cost of 327 | * duplicating the packet buffer, but this can be executed out of 328 | * the eBPF program. Conversely, **bpf_redirect**\ () is more 329 | * efficient, but it is handled through an action code where the 330 | * redirection happens only after the eBPF program has returned. 331 | * 332 | * A call to this helper is susceptible to change the underlying 333 | * packet buffer. Therefore, at load time, all checks on pointers 334 | * previously done by the verifier are invalidated and must be 335 | * performed again, if the helper is used in combination with 336 | * direct packet access. 337 | * 338 | * Returns 339 | * 0 on success, or a negative error in case of failure. 340 | */ 341 | static long (*bpf_clone_redirect)(struct __sk_buff *skb, __u32 ifindex, __u64 flags) = (void *) 13; 342 | 343 | /* 344 | * bpf_get_current_pid_tgid 345 | * 346 | * 347 | * Returns 348 | * A 64-bit integer containing the current tgid and pid, and 349 | * created as such: 350 | * *current_task*\ **->tgid << 32 \|** 351 | * *current_task*\ **->pid**. 352 | */ 353 | static __u64 (*bpf_get_current_pid_tgid)(void) = (void *) 14; 354 | 355 | /* 356 | * bpf_get_current_uid_gid 357 | * 358 | * 359 | * Returns 360 | * A 64-bit integer containing the current GID and UID, and 361 | * created as such: *current_gid* **<< 32 \|** *current_uid*. 362 | */ 363 | static __u64 (*bpf_get_current_uid_gid)(void) = (void *) 15; 364 | 365 | /* 366 | * bpf_get_current_comm 367 | * 368 | * Copy the **comm** attribute of the current task into *buf* of 369 | * *size_of_buf*. The **comm** attribute contains the name of 370 | * the executable (excluding the path) for the current task. 
The 371 | * *size_of_buf* must be strictly positive. On success, the 372 | * helper makes sure that the *buf* is NUL-terminated. On failure, 373 | * it is filled with zeroes. 374 | * 375 | * Returns 376 | * 0 on success, or a negative error in case of failure. 377 | */ 378 | static long (*bpf_get_current_comm)(void *buf, __u32 size_of_buf) = (void *) 16; 379 | 380 | /* 381 | * bpf_get_cgroup_classid 382 | * 383 | * Retrieve the classid for the current task, i.e. for the net_cls 384 | * cgroup to which *skb* belongs. 385 | * 386 | * This helper can be used on TC egress path, but not on ingress. 387 | * 388 | * The net_cls cgroup provides an interface to tag network packets 389 | * based on a user-provided identifier for all traffic coming from 390 | * the tasks belonging to the related cgroup. See also the related 391 | * kernel documentation, available from the Linux sources in file 392 | * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. 393 | * 394 | * The Linux kernel has two versions for cgroups: there are 395 | * cgroups v1 and cgroups v2. Both are available to users, who can 396 | * use a mixture of them, but note that the net_cls cgroup is for 397 | * cgroup v1 only. This makes it incompatible with BPF programs 398 | * run on cgroups, which is a cgroup-v2-only feature (a socket can 399 | * only hold data for one version of cgroups at a time). 400 | * 401 | * This helper is only available is the kernel was compiled with 402 | * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to 403 | * "**y**" or to "**m**". 404 | * 405 | * Returns 406 | * The classid, or 0 for the default unconfigured classid. 407 | */ 408 | static __u32 (*bpf_get_cgroup_classid)(struct __sk_buff *skb) = (void *) 17; 409 | 410 | /* 411 | * bpf_skb_vlan_push 412 | * 413 | * Push a *vlan_tci* (VLAN tag control information) of protocol 414 | * *vlan_proto* to the packet associated to *skb*, then update 415 | * the checksum. Note that if *vlan_proto* is different from 416 | * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to 417 | * be **ETH_P_8021Q**. 418 | * 419 | * A call to this helper is susceptible to change the underlying 420 | * packet buffer. Therefore, at load time, all checks on pointers 421 | * previously done by the verifier are invalidated and must be 422 | * performed again, if the helper is used in combination with 423 | * direct packet access. 424 | * 425 | * Returns 426 | * 0 on success, or a negative error in case of failure. 427 | */ 428 | static long (*bpf_skb_vlan_push)(struct __sk_buff *skb, __be16 vlan_proto, __u16 vlan_tci) = (void *) 18; 429 | 430 | /* 431 | * bpf_skb_vlan_pop 432 | * 433 | * Pop a VLAN header from the packet associated to *skb*. 434 | * 435 | * A call to this helper is susceptible to change the underlying 436 | * packet buffer. Therefore, at load time, all checks on pointers 437 | * previously done by the verifier are invalidated and must be 438 | * performed again, if the helper is used in combination with 439 | * direct packet access. 440 | * 441 | * Returns 442 | * 0 on success, or a negative error in case of failure. 443 | */ 444 | static long (*bpf_skb_vlan_pop)(struct __sk_buff *skb) = (void *) 19; 445 | 446 | /* 447 | * bpf_skb_get_tunnel_key 448 | * 449 | * Get tunnel metadata. This helper takes a pointer *key* to an 450 | * empty **struct bpf_tunnel_key** of **size**, that will be 451 | * filled with tunnel metadata for the packet associated to *skb*. 
452 | * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which 453 | * indicates that the tunnel is based on IPv6 protocol instead of 454 | * IPv4. 455 | * 456 | * The **struct bpf_tunnel_key** is an object that generalizes the 457 | * principal parameters used by various tunneling protocols into a 458 | * single struct. This way, it can be used to easily make a 459 | * decision based on the contents of the encapsulation header, 460 | * "summarized" in this struct. In particular, it holds the IP 461 | * address of the remote end (IPv4 or IPv6, depending on the case) 462 | * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, 463 | * this struct exposes the *key*\ **->tunnel_id**, which is 464 | * generally mapped to a VNI (Virtual Network Identifier), making 465 | * it programmable together with the **bpf_skb_set_tunnel_key**\ 466 | * () helper. 467 | * 468 | * Let's imagine that the following code is part of a program 469 | * attached to the TC ingress interface, on one end of a GRE 470 | * tunnel, and is supposed to filter out all messages coming from 471 | * remote ends with IPv4 address other than 10.0.0.1: 472 | * 473 | * :: 474 | * 475 | * int ret; 476 | * struct bpf_tunnel_key key = {}; 477 | * 478 | * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); 479 | * if (ret < 0) 480 | * return TC_ACT_SHOT; // drop packet 481 | * 482 | * if (key.remote_ipv4 != 0x0a000001) 483 | * return TC_ACT_SHOT; // drop packet 484 | * 485 | * return TC_ACT_OK; // accept packet 486 | * 487 | * This interface can also be used with all encapsulation devices 488 | * that can operate in "collect metadata" mode: instead of having 489 | * one network device per specific configuration, the "collect 490 | * metadata" mode only requires a single device where the 491 | * configuration can be extracted from this helper. 492 | * 493 | * This can be used together with various tunnels such as VXLan, 494 | * Geneve, GRE or IP in IP (IPIP). 495 | * 496 | * Returns 497 | * 0 on success, or a negative error in case of failure. 498 | */ 499 | static long (*bpf_skb_get_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 20; 500 | 501 | /* 502 | * bpf_skb_set_tunnel_key 503 | * 504 | * Populate tunnel metadata for packet associated to *skb.* The 505 | * tunnel metadata is set to the contents of *key*, of *size*. The 506 | * *flags* can be set to a combination of the following values: 507 | * 508 | * **BPF_F_TUNINFO_IPV6** 509 | * Indicate that the tunnel is based on IPv6 protocol 510 | * instead of IPv4. 511 | * **BPF_F_ZERO_CSUM_TX** 512 | * For IPv4 packets, add a flag to tunnel metadata 513 | * indicating that checksum computation should be skipped 514 | * and checksum set to zeroes. 515 | * **BPF_F_DONT_FRAGMENT** 516 | * Add a flag to tunnel metadata indicating that the 517 | * packet should not be fragmented. 518 | * **BPF_F_SEQ_NUMBER** 519 | * Add a flag to tunnel metadata indicating that a 520 | * sequence number should be added to tunnel header before 521 | * sending the packet. This flag was added for GRE 522 | * encapsulation, but might be used with other protocols 523 | * as well in the future. 524 | * 525 | * Here is a typical usage on the transmit path: 526 | * 527 | * :: 528 | * 529 | * struct bpf_tunnel_key key; 530 | * populate key ... 
531 | * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); 532 | * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); 533 | * 534 | * See also the description of the **bpf_skb_get_tunnel_key**\ () 535 | * helper for additional information. 536 | * 537 | * Returns 538 | * 0 on success, or a negative error in case of failure. 539 | */ 540 | static long (*bpf_skb_set_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 21; 541 | 542 | /* 543 | * bpf_perf_event_read 544 | * 545 | * Read the value of a perf event counter. This helper relies on a 546 | * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of 547 | * the perf event counter is selected when *map* is updated with 548 | * perf event file descriptors. The *map* is an array whose size 549 | * is the number of available CPUs, and each cell contains a value 550 | * relative to one CPU. The value to retrieve is indicated by 551 | * *flags*, that contains the index of the CPU to look up, masked 552 | * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to 553 | * **BPF_F_CURRENT_CPU** to indicate that the value for the 554 | * current CPU should be retrieved. 555 | * 556 | * Note that before Linux 4.13, only hardware perf event can be 557 | * retrieved. 558 | * 559 | * Also, be aware that the newer helper 560 | * **bpf_perf_event_read_value**\ () is recommended over 561 | * **bpf_perf_event_read**\ () in general. The latter has some ABI 562 | * quirks where error and counter value are used as a return code 563 | * (which is wrong to do since ranges may overlap). This issue is 564 | * fixed with **bpf_perf_event_read_value**\ (), which at the same 565 | * time provides more features over the **bpf_perf_event_read**\ 566 | * () interface. Please refer to the description of 567 | * **bpf_perf_event_read_value**\ () for details. 568 | * 569 | * Returns 570 | * The value of the perf event counter read from the map, or a 571 | * negative error code in case of failure. 572 | */ 573 | static __u64 (*bpf_perf_event_read)(void *map, __u64 flags) = (void *) 22; 574 | 575 | /* 576 | * bpf_redirect 577 | * 578 | * Redirect the packet to another net device of index *ifindex*. 579 | * This helper is somewhat similar to **bpf_clone_redirect**\ 580 | * (), except that the packet is not cloned, which provides 581 | * increased performance. 582 | * 583 | * Except for XDP, both ingress and egress interfaces can be used 584 | * for redirection. The **BPF_F_INGRESS** value in *flags* is used 585 | * to make the distinction (ingress path is selected if the flag 586 | * is present, egress path otherwise). Currently, XDP only 587 | * supports redirection to the egress interface, and accepts no 588 | * flag at all. 589 | * 590 | * The same effect can also be attained with the more generic 591 | * **bpf_redirect_map**\ (), which uses a BPF map to store the 592 | * redirect target instead of providing it directly to the helper. 593 | * 594 | * Returns 595 | * For XDP, the helper returns **XDP_REDIRECT** on success or 596 | * **XDP_ABORTED** on error. For other program types, the values 597 | * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on 598 | * error. 599 | */ 600 | static long (*bpf_redirect)(__u32 ifindex, __u64 flags) = (void *) 23; 601 | 602 | /* 603 | * bpf_get_route_realm 604 | * 605 | * Retrieve the realm or the route, that is to say the 606 | * **tclassid** field of the destination for the *skb*. 
The 607 | * indentifier retrieved is a user-provided tag, similar to the 608 | * one used with the net_cls cgroup (see description for 609 | * **bpf_get_cgroup_classid**\ () helper), but here this tag is 610 | * held by a route (a destination entry), not by a task. 611 | * 612 | * Retrieving this identifier works with the clsact TC egress hook 613 | * (see also **tc-bpf(8)**), or alternatively on conventional 614 | * classful egress qdiscs, but not on TC ingress path. In case of 615 | * clsact TC egress hook, this has the advantage that, internally, 616 | * the destination entry has not been dropped yet in the transmit 617 | * path. Therefore, the destination entry does not need to be 618 | * artificially held via **netif_keep_dst**\ () for a classful 619 | * qdisc until the *skb* is freed. 620 | * 621 | * This helper is available only if the kernel was compiled with 622 | * **CONFIG_IP_ROUTE_CLASSID** configuration option. 623 | * 624 | * Returns 625 | * The realm of the route for the packet associated to *skb*, or 0 626 | * if none was found. 627 | */ 628 | static __u32 (*bpf_get_route_realm)(struct __sk_buff *skb) = (void *) 24; 629 | 630 | /* 631 | * bpf_perf_event_output 632 | * 633 | * Write raw *data* blob into a special BPF perf event held by 634 | * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf 635 | * event must have the following attributes: **PERF_SAMPLE_RAW** 636 | * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and 637 | * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. 638 | * 639 | * The *flags* are used to indicate the index in *map* for which 640 | * the value must be put, masked with **BPF_F_INDEX_MASK**. 641 | * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** 642 | * to indicate that the index of the current CPU core should be 643 | * used. 644 | * 645 | * The value to write, of *size*, is passed through eBPF stack and 646 | * pointed by *data*. 647 | * 648 | * The context of the program *ctx* needs also be passed to the 649 | * helper. 650 | * 651 | * On user space, a program willing to read the values needs to 652 | * call **perf_event_open**\ () on the perf event (either for 653 | * one or for all CPUs) and to store the file descriptor into the 654 | * *map*. This must be done before the eBPF program can send data 655 | * into it. An example is available in file 656 | * *samples/bpf/trace_output_user.c* in the Linux kernel source 657 | * tree (the eBPF program counterpart is in 658 | * *samples/bpf/trace_output_kern.c*). 659 | * 660 | * **bpf_perf_event_output**\ () achieves better performance 661 | * than **bpf_trace_printk**\ () for sharing data with user 662 | * space, and is much better suitable for streaming data from eBPF 663 | * programs. 664 | * 665 | * Note that this helper is not restricted to tracing use cases 666 | * and can be used with programs attached to TC or XDP as well, 667 | * where it allows for passing data to user space listeners. Data 668 | * can be: 669 | * 670 | * * Only custom structs, 671 | * * Only the packet payload, or 672 | * * A combination of both. 673 | * 674 | * Returns 675 | * 0 on success, or a negative error in case of failure. 676 | */ 677 | static long (*bpf_perf_event_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 25; 678 | 679 | /* 680 | * bpf_skb_load_bytes 681 | * 682 | * This helper was provided as an easy way to load data from a 683 | * packet. 
It can be used to load *len* bytes from *offset* from 684 | * the packet associated to *skb*, into the buffer pointed by 685 | * *to*. 686 | * 687 | * Since Linux 4.7, usage of this helper has mostly been replaced 688 | * by "direct packet access", enabling packet data to be 689 | * manipulated with *skb*\ **->data** and *skb*\ **->data_end** 690 | * pointing respectively to the first byte of packet data and to 691 | * the byte after the last byte of packet data. However, it 692 | * remains useful if one wishes to read large quantities of data 693 | * at once from a packet into the eBPF stack. 694 | * 695 | * Returns 696 | * 0 on success, or a negative error in case of failure. 697 | */ 698 | static long (*bpf_skb_load_bytes)(const void *skb, __u32 offset, void *to, __u32 len) = (void *) 26; 699 | 700 | /* 701 | * bpf_get_stackid 702 | * 703 | * Walk a user or a kernel stack and return its id. To achieve 704 | * this, the helper needs *ctx*, which is a pointer to the context 705 | * on which the tracing program is executed, and a pointer to a 706 | * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. 707 | * 708 | * The last argument, *flags*, holds the number of stack frames to 709 | * skip (from 0 to 255), masked with 710 | * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set 711 | * a combination of the following flags: 712 | * 713 | * **BPF_F_USER_STACK** 714 | * Collect a user space stack instead of a kernel stack. 715 | * **BPF_F_FAST_STACK_CMP** 716 | * Compare stacks by hash only. 717 | * **BPF_F_REUSE_STACKID** 718 | * If two different stacks hash into the same *stackid*, 719 | * discard the old one. 720 | * 721 | * The stack id retrieved is a 32 bit long integer handle which 722 | * can be further combined with other data (including other stack 723 | * ids) and used as a key into maps. This can be useful for 724 | * generating a variety of graphs (such as flame graphs or off-cpu 725 | * graphs). 726 | * 727 | * For walking a stack, this helper is an improvement over 728 | * **bpf_probe_read**\ (), which can be used with unrolled loops 729 | * but is not efficient and consumes a lot of eBPF instructions. 730 | * Instead, **bpf_get_stackid**\ () can collect up to 731 | * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that 732 | * this limit can be controlled with the **sysctl** program, and 733 | * that it should be manually increased in order to profile long 734 | * user stacks (such as stacks for Java programs). To do so, use: 735 | * 736 | * :: 737 | * 738 | * # sysctl kernel.perf_event_max_stack= 739 | * 740 | * Returns 741 | * The positive or null stack id on success, or a negative error 742 | * in case of failure. 743 | */ 744 | static long (*bpf_get_stackid)(void *ctx, void *map, __u64 flags) = (void *) 27; 745 | 746 | /* 747 | * bpf_csum_diff 748 | * 749 | * Compute a checksum difference, from the raw buffer pointed by 750 | * *from*, of length *from_size* (that must be a multiple of 4), 751 | * towards the raw buffer pointed by *to*, of size *to_size* 752 | * (same remark). An optional *seed* can be added to the value 753 | * (this can be cascaded, the seed may come from a previous call 754 | * to the helper). 755 | * 756 | * This is flexible enough to be used in several ways: 757 | * 758 | * * With *from_size* == 0, *to_size* > 0 and *seed* set to 759 | * checksum, it can be used when pushing new data. 760 | * * With *from_size* > 0, *to_size* == 0 and *seed* set to 761 | * checksum, it can be used when removing data from a packet. 
762 | * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it 763 | * can be used to compute a diff. Note that *from_size* and 764 | * *to_size* do not need to be equal. 765 | * 766 | * This helper can be used in combination with 767 | * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to 768 | * which one can feed in the difference computed with 769 | * **bpf_csum_diff**\ (). 770 | * 771 | * Returns 772 | * The checksum result, or a negative error code in case of 773 | * failure. 774 | */ 775 | static __s64 (*bpf_csum_diff)(__be32 *from, __u32 from_size, __be32 *to, __u32 to_size, __wsum seed) = (void *) 28; 776 | 777 | /* 778 | * bpf_skb_get_tunnel_opt 779 | * 780 | * Retrieve tunnel options metadata for the packet associated to 781 | * *skb*, and store the raw tunnel option data to the buffer *opt* 782 | * of *size*. 783 | * 784 | * This helper can be used with encapsulation devices that can 785 | * operate in "collect metadata" mode (please refer to the related 786 | * note in the description of **bpf_skb_get_tunnel_key**\ () for 787 | * more details). A particular example where this can be used is 788 | * in combination with the Geneve encapsulation protocol, where it 789 | * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) 790 | * and retrieving arbitrary TLVs (Type-Length-Value headers) from 791 | * the eBPF program. This allows for full customization of these 792 | * headers. 793 | * 794 | * Returns 795 | * The size of the option data retrieved. 796 | */ 797 | static long (*bpf_skb_get_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 29; 798 | 799 | /* 800 | * bpf_skb_set_tunnel_opt 801 | * 802 | * Set tunnel options metadata for the packet associated to *skb* 803 | * to the option data contained in the raw buffer *opt* of *size*. 804 | * 805 | * See also the description of the **bpf_skb_get_tunnel_opt**\ () 806 | * helper for additional information. 807 | * 808 | * Returns 809 | * 0 on success, or a negative error in case of failure. 810 | */ 811 | static long (*bpf_skb_set_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 30; 812 | 813 | /* 814 | * bpf_skb_change_proto 815 | * 816 | * Change the protocol of the *skb* to *proto*. Currently 817 | * supported are transition from IPv4 to IPv6, and from IPv6 to 818 | * IPv4. The helper takes care of the groundwork for the 819 | * transition, including resizing the socket buffer. The eBPF 820 | * program is expected to fill the new headers, if any, via 821 | * **skb_store_bytes**\ () and to recompute the checksums with 822 | * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ 823 | * (). The main case for this helper is to perform NAT64 824 | * operations out of an eBPF program. 825 | * 826 | * Internally, the GSO type is marked as dodgy so that headers are 827 | * checked and segments are recalculated by the GSO/GRO engine. 828 | * The size for GSO target is adapted as well. 829 | * 830 | * All values for *flags* are reserved for future usage, and must 831 | * be left at zero. 832 | * 833 | * A call to this helper is susceptible to change the underlying 834 | * packet buffer. Therefore, at load time, all checks on pointers 835 | * previously done by the verifier are invalidated and must be 836 | * performed again, if the helper is used in combination with 837 | * direct packet access. 838 | * 839 | * Returns 840 | * 0 on success, or a negative error in case of failure. 
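The "feed the difference into bpf_l3_csum_replace()/bpf_l4_csum_replace()" pattern described for bpf_csum_diff() above is easy to get wrong, so here is a minimal, hedged sketch rather than anything taken from the kernel documentation. It assumes an skb-based program (e.g. a TC classifier), a plain Ethernet + 20-byte IPv4 + UDP layout, and that BPF_F_PSEUDO_HDR from the vendored linux/bpf.h is in scope; old_ip/new_ip stand for the address before and after a rewrite.

```c
/* Illustrative sketch: fold the checksum delta of an IPv4 address rewrite
 * into the UDP checksum. Offsets are illustrative (Ethernet + 20-byte IPv4,
 * UDP checksum at byte 6 of the UDP header). */
static inline int fix_udp_csum(struct __sk_buff *skb, __be32 old_ip, __be32 new_ip)
{
	const __u32 udp_csum_off = 14 + 20 + 6;
	__s64 diff = bpf_csum_diff(&old_ip, sizeof(old_ip), &new_ip, sizeof(new_ip), 0);

	if (diff < 0)
		return diff;
	/* with from == 0 and no size bits set in flags, 'to' is taken as a
	 * ready-made checksum difference; BPF_F_PSEUDO_HDR marks that the
	 * change also affects the UDP pseudo-header */
	return bpf_l4_csum_replace(skb, udp_csum_off, 0, diff, BPF_F_PSEUDO_HDR);
}
```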
841 | */ 842 | static long (*bpf_skb_change_proto)(struct __sk_buff *skb, __be16 proto, __u64 flags) = (void *) 31; 843 | 844 | /* 845 | * bpf_skb_change_type 846 | * 847 | * Change the packet type for the packet associated to *skb*. This 848 | * comes down to setting *skb*\ **->pkt_type** to *type*, except 849 | * the eBPF program does not have a write access to *skb*\ 850 | * **->pkt_type** beside this helper. Using a helper here allows 851 | * for graceful handling of errors. 852 | * 853 | * The major use case is to change incoming *skb*s to 854 | * **PACKET_HOST** in a programmatic way instead of having to 855 | * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for 856 | * example. 857 | * 858 | * Note that *type* only allows certain values. At this time, they 859 | * are: 860 | * 861 | * **PACKET_HOST** 862 | * Packet is for us. 863 | * **PACKET_BROADCAST** 864 | * Send packet to all. 865 | * **PACKET_MULTICAST** 866 | * Send packet to group. 867 | * **PACKET_OTHERHOST** 868 | * Send packet to someone else. 869 | * 870 | * Returns 871 | * 0 on success, or a negative error in case of failure. 872 | */ 873 | static long (*bpf_skb_change_type)(struct __sk_buff *skb, __u32 type) = (void *) 32; 874 | 875 | /* 876 | * bpf_skb_under_cgroup 877 | * 878 | * Check whether *skb* is a descendant of the cgroup2 held by 879 | * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. 880 | * 881 | * Returns 882 | * The return value depends on the result of the test, and can be: 883 | * 884 | * * 0, if the *skb* failed the cgroup2 descendant test. 885 | * * 1, if the *skb* succeeded the cgroup2 descendant test. 886 | * * A negative error code, if an error occurred. 887 | */ 888 | static long (*bpf_skb_under_cgroup)(struct __sk_buff *skb, void *map, __u32 index) = (void *) 33; 889 | 890 | /* 891 | * bpf_get_hash_recalc 892 | * 893 | * Retrieve the hash of the packet, *skb*\ **->hash**. If it is 894 | * not set, in particular if the hash was cleared due to mangling, 895 | * recompute this hash. Later accesses to the hash can be done 896 | * directly with *skb*\ **->hash**. 897 | * 898 | * Calling **bpf_set_hash_invalid**\ (), changing a packet 899 | * prototype with **bpf_skb_change_proto**\ (), or calling 900 | * **bpf_skb_store_bytes**\ () with the 901 | * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear 902 | * the hash and to trigger a new computation for the next call to 903 | * **bpf_get_hash_recalc**\ (). 904 | * 905 | * Returns 906 | * The 32-bit hash. 907 | */ 908 | static __u32 (*bpf_get_hash_recalc)(struct __sk_buff *skb) = (void *) 34; 909 | 910 | /* 911 | * bpf_get_current_task 912 | * 913 | * 914 | * Returns 915 | * A pointer to the current task struct. 916 | */ 917 | static __u64 (*bpf_get_current_task)(void) = (void *) 35; 918 | 919 | /* 920 | * bpf_probe_write_user 921 | * 922 | * Attempt in a safe way to write *len* bytes from the buffer 923 | * *src* to *dst* in memory. It only works for threads that are in 924 | * user context, and *dst* must be a valid user space address. 925 | * 926 | * This helper should not be used to implement any kind of 927 | * security mechanism because of TOC-TOU attacks, but rather to 928 | * debug, divert, and manipulate execution of semi-cooperative 929 | * processes. 930 | * 931 | * Keep in mind that this feature is meant for experiments, and it 932 | * has a risk of crashing the system and running programs. 
933 | * Therefore, when an eBPF program using this helper is attached, 934 | * a warning including PID and process name is printed to kernel 935 | * logs. 936 | * 937 | * Returns 938 | * 0 on success, or a negative error in case of failure. 939 | */ 940 | static long (*bpf_probe_write_user)(void *dst, const void *src, __u32 len) = (void *) 36; 941 | 942 | /* 943 | * bpf_current_task_under_cgroup 944 | * 945 | * Check whether the probe is being run is the context of a given 946 | * subset of the cgroup2 hierarchy. The cgroup2 to test is held by 947 | * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. 948 | * 949 | * Returns 950 | * The return value depends on the result of the test, and can be: 951 | * 952 | * * 0, if the *skb* task belongs to the cgroup2. 953 | * * 1, if the *skb* task does not belong to the cgroup2. 954 | * * A negative error code, if an error occurred. 955 | */ 956 | static long (*bpf_current_task_under_cgroup)(void *map, __u32 index) = (void *) 37; 957 | 958 | /* 959 | * bpf_skb_change_tail 960 | * 961 | * Resize (trim or grow) the packet associated to *skb* to the 962 | * new *len*. The *flags* are reserved for future usage, and must 963 | * be left at zero. 964 | * 965 | * The basic idea is that the helper performs the needed work to 966 | * change the size of the packet, then the eBPF program rewrites 967 | * the rest via helpers like **bpf_skb_store_bytes**\ (), 968 | * **bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ () 969 | * and others. This helper is a slow path utility intended for 970 | * replies with control messages. And because it is targeted for 971 | * slow path, the helper itself can afford to be slow: it 972 | * implicitly linearizes, unclones and drops offloads from the 973 | * *skb*. 974 | * 975 | * A call to this helper is susceptible to change the underlying 976 | * packet buffer. Therefore, at load time, all checks on pointers 977 | * previously done by the verifier are invalidated and must be 978 | * performed again, if the helper is used in combination with 979 | * direct packet access. 980 | * 981 | * Returns 982 | * 0 on success, or a negative error in case of failure. 983 | */ 984 | static long (*bpf_skb_change_tail)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 38; 985 | 986 | /* 987 | * bpf_skb_pull_data 988 | * 989 | * Pull in non-linear data in case the *skb* is non-linear and not 990 | * all of *len* are part of the linear section. Make *len* bytes 991 | * from *skb* readable and writable. If a zero value is passed for 992 | * *len*, then the whole length of the *skb* is pulled. 993 | * 994 | * This helper is only needed for reading and writing with direct 995 | * packet access. 996 | * 997 | * For direct packet access, testing that offsets to access 998 | * are within packet boundaries (test on *skb*\ **->data_end**) is 999 | * susceptible to fail if offsets are invalid, or if the requested 1000 | * data is in non-linear parts of the *skb*. On failure the 1001 | * program can just bail out, or in the case of a non-linear 1002 | * buffer, use a helper to make the data available. The 1003 | * **bpf_skb_load_bytes**\ () helper is a first solution to access 1004 | * the data. Another one consists in using **bpf_skb_pull_data** 1005 | * to pull in once the non-linear parts, then retesting and 1006 | * eventually access the data. 1007 | * 1008 | * At the same time, this also makes sure the *skb* is uncloned, 1009 | * which is a necessary condition for direct write. 
As this needs 1010 | * to be an invariant for the write part only, the verifier 1011 | * detects writes and adds a prologue that is calling 1012 | * **bpf_skb_pull_data()** to effectively unclone the *skb* from 1013 | * the very beginning in case it is indeed cloned. 1014 | * 1015 | * A call to this helper is susceptible to change the underlying 1016 | * packet buffer. Therefore, at load time, all checks on pointers 1017 | * previously done by the verifier are invalidated and must be 1018 | * performed again, if the helper is used in combination with 1019 | * direct packet access. 1020 | * 1021 | * Returns 1022 | * 0 on success, or a negative error in case of failure. 1023 | */ 1024 | static long (*bpf_skb_pull_data)(struct __sk_buff *skb, __u32 len) = (void *) 39; 1025 | 1026 | /* 1027 | * bpf_csum_update 1028 | * 1029 | * Add the checksum *csum* into *skb*\ **->csum** in case the 1030 | * driver has supplied a checksum for the entire packet into that 1031 | * field. Return an error otherwise. This helper is intended to be 1032 | * used in combination with **bpf_csum_diff**\ (), in particular 1033 | * when the checksum needs to be updated after data has been 1034 | * written into the packet through direct packet access. 1035 | * 1036 | * Returns 1037 | * The checksum on success, or a negative error code in case of 1038 | * failure. 1039 | */ 1040 | static __s64 (*bpf_csum_update)(struct __sk_buff *skb, __wsum csum) = (void *) 40; 1041 | 1042 | /* 1043 | * bpf_set_hash_invalid 1044 | * 1045 | * Invalidate the current *skb*\ **->hash**. It can be used after 1046 | * mangling on headers through direct packet access, in order to 1047 | * indicate that the hash is outdated and to trigger a 1048 | * recalculation the next time the kernel tries to access this 1049 | * hash or when the **bpf_get_hash_recalc**\ () helper is called. 1050 | * 1051 | */ 1052 | static void (*bpf_set_hash_invalid)(struct __sk_buff *skb) = (void *) 41; 1053 | 1054 | /* 1055 | * bpf_get_numa_node_id 1056 | * 1057 | * Return the id of the current NUMA node. The primary use case 1058 | * for this helper is the selection of sockets for the local NUMA 1059 | * node, when the program is attached to sockets using the 1060 | * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), 1061 | * but the helper is also available to other eBPF program types, 1062 | * similarly to **bpf_get_smp_processor_id**\ (). 1063 | * 1064 | * Returns 1065 | * The id of current NUMA node. 1066 | */ 1067 | static long (*bpf_get_numa_node_id)(void) = (void *) 42; 1068 | 1069 | /* 1070 | * bpf_skb_change_head 1071 | * 1072 | * Grows headroom of packet associated to *skb* and adjusts the 1073 | * offset of the MAC header accordingly, adding *len* bytes of 1074 | * space. It automatically extends and reallocates memory as 1075 | * required. 1076 | * 1077 | * This helper can be used on a layer 3 *skb* to push a MAC header 1078 | * for redirection into a layer 2 device. 1079 | * 1080 | * All values for *flags* are reserved for future usage, and must 1081 | * be left at zero. 1082 | * 1083 | * A call to this helper is susceptible to change the underlying 1084 | * packet buffer. Therefore, at load time, all checks on pointers 1085 | * previously done by the verifier are invalidated and must be 1086 | * performed again, if the helper is used in combination with 1087 | * direct packet access. 1088 | * 1089 | * Returns 1090 | * 0 on success, or a negative error in case of failure. 
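Because the pull-then-recheck sequence for bpf_skb_pull_data() described above is the step most often missed, a minimal sketch follows; it is an assumption-laden illustration (a program type with direct packet access, such as a TC classifier, and an arbitrary length of 20 bytes), not code from this repository.

```c
/* Illustrative sketch: linearise the first 20 bytes, then re-validate the
 * data/data_end window, since the helper may have changed the underlying
 * packet buffer and all prior pointer checks are invalidated. */
static inline int read_first_byte(struct __sk_buff *skb)
{
	if (bpf_skb_pull_data(skb, 20) < 0)
		return -1;

	void *data     = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;

	if (data + 20 > data_end)
		return -1;

	return *(__u8 *)data; /* direct packet access is now safe */
}
```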
1091 | */ 1092 | static long (*bpf_skb_change_head)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 43; 1093 | 1094 | /* 1095 | * bpf_xdp_adjust_head 1096 | * 1097 | * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that 1098 | * it is possible to use a negative value for *delta*. This helper 1099 | * can be used to prepare the packet for pushing or popping 1100 | * headers. 1101 | * 1102 | * A call to this helper is susceptible to change the underlying 1103 | * packet buffer. Therefore, at load time, all checks on pointers 1104 | * previously done by the verifier are invalidated and must be 1105 | * performed again, if the helper is used in combination with 1106 | * direct packet access. 1107 | * 1108 | * Returns 1109 | * 0 on success, or a negative error in case of failure. 1110 | */ 1111 | static long (*bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) 44; 1112 | 1113 | /* 1114 | * bpf_probe_read_str 1115 | * 1116 | * Copy a NUL terminated string from an unsafe kernel address 1117 | * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for 1118 | * more details. 1119 | * 1120 | * Generally, use **bpf_probe_read_user_str**\ () or 1121 | * **bpf_probe_read_kernel_str**\ () instead. 1122 | * 1123 | * Returns 1124 | * On success, the strictly positive length of the string, 1125 | * including the trailing NUL character. On error, a negative 1126 | * value. 1127 | */ 1128 | static long (*bpf_probe_read_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 45; 1129 | 1130 | /* 1131 | * bpf_get_socket_cookie 1132 | * 1133 | * If the **struct sk_buff** pointed by *skb* has a known socket, 1134 | * retrieve the cookie (generated by the kernel) of this socket. 1135 | * If no cookie has been set yet, generate a new cookie. Once 1136 | * generated, the socket cookie remains stable for the life of the 1137 | * socket. This helper can be useful for monitoring per socket 1138 | * networking traffic statistics as it provides a global socket 1139 | * identifier that can be assumed unique. 1140 | * 1141 | * Returns 1142 | * A 8-byte long non-decreasing number on success, or 0 if the 1143 | * socket field is missing inside *skb*. 1144 | */ 1145 | static __u64 (*bpf_get_socket_cookie)(void *ctx) = (void *) 46; 1146 | 1147 | /* 1148 | * bpf_get_socket_uid 1149 | * 1150 | * 1151 | * Returns 1152 | * The owner UID of the socket associated to *skb*. If the socket 1153 | * is **NULL**, or if it is not a full socket (i.e. if it is a 1154 | * time-wait or a request socket instead), **overflowuid** value 1155 | * is returned (note that **overflowuid** might also be the actual 1156 | * UID value for the socket). 1157 | */ 1158 | static __u32 (*bpf_get_socket_uid)(struct __sk_buff *skb) = (void *) 47; 1159 | 1160 | /* 1161 | * bpf_set_hash 1162 | * 1163 | * Set the full hash for *skb* (set the field *skb*\ **->hash**) 1164 | * to value *hash*. 1165 | * 1166 | * Returns 1167 | * 0 1168 | */ 1169 | static long (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; 1170 | 1171 | /* 1172 | * bpf_setsockopt 1173 | * 1174 | * Emulate a call to **setsockopt()** on the socket associated to 1175 | * *bpf_socket*, which must be a full socket. The *level* at 1176 | * which the option resides and the name *optname* of the option 1177 | * must be specified, see **setsockopt(2)** for more information. 1178 | * The option value of length *optlen* is pointed by *optval*. 
1179 | * 1180 | * *bpf_socket* should be one of the following: 1181 | * 1182 | * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 1183 | * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** 1184 | * and **BPF_CGROUP_INET6_CONNECT**. 1185 | * 1186 | * This helper actually implements a subset of **setsockopt()**. 1187 | * It supports the following *level*\ s: 1188 | * 1189 | * * **SOL_SOCKET**, which supports the following *optname*\ s: 1190 | * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, 1191 | * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, 1192 | * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. 1193 | * * **IPPROTO_TCP**, which supports the following *optname*\ s: 1194 | * **TCP_CONGESTION**, **TCP_BPF_IW**, 1195 | * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, 1196 | * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, 1197 | * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. 1198 | * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. 1199 | * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. 1200 | * 1201 | * Returns 1202 | * 0 on success, or a negative error in case of failure. 1203 | */ 1204 | static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 49; 1205 | 1206 | /* 1207 | * bpf_skb_adjust_room 1208 | * 1209 | * Grow or shrink the room for data in the packet associated to 1210 | * *skb* by *len_diff*, and according to the selected *mode*. 1211 | * 1212 | * By default, the helper will reset any offloaded checksum 1213 | * indicator of the skb to CHECKSUM_NONE. This can be avoided 1214 | * by the following flag: 1215 | * 1216 | * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded 1217 | * checksum data of the skb to CHECKSUM_NONE. 1218 | * 1219 | * There are two supported modes at this time: 1220 | * 1221 | * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer 1222 | * (room space is added or removed below the layer 2 header). 1223 | * 1224 | * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer 1225 | * (room space is added or removed below the layer 3 header). 1226 | * 1227 | * The following flags are supported at this time: 1228 | * 1229 | * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. 1230 | * Adjusting mss in this way is not allowed for datagrams. 1231 | * 1232 | * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, 1233 | * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: 1234 | * Any new space is reserved to hold a tunnel header. 1235 | * Configure skb offsets and other fields accordingly. 1236 | * 1237 | * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, 1238 | * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: 1239 | * Use with ENCAP_L3 flags to further specify the tunnel type. 1240 | * 1241 | * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): 1242 | * Use with ENCAP_L3/L4 flags to further specify the tunnel 1243 | * type; *len* is the length of the inner MAC header. 1244 | * 1245 | * A call to this helper is susceptible to change the underlying 1246 | * packet buffer. Therefore, at load time, all checks on pointers 1247 | * previously done by the verifier are invalidated and must be 1248 | * performed again, if the helper is used in combination with 1249 | * direct packet access. 1250 | * 1251 | * Returns 1252 | * 0 on success, or a negative error in case of failure. 1253 | */ 1254 | static long (*bpf_skb_adjust_room)(struct __sk_buff *skb, __s32 len_diff, __u32 mode, __u64 flags) = (void *) 50; 1255 | 1256 | /* 1257 | * bpf_redirect_map 1258 | * 1259 | * Redirect the packet to the endpoint referenced by *map* at 1260 | * index *key*. 
Depending on its type, this *map* can contain 1261 | * references to net devices (for forwarding packets through other 1262 | * ports), or to CPUs (for redirecting XDP frames to another CPU; 1263 | * but this is only implemented for native XDP (with driver 1264 | * support) as of this writing). 1265 | * 1266 | * The lower two bits of *flags* are used as the return code if 1267 | * the map lookup fails. This is so that the return value can be 1268 | * one of the XDP program return codes up to **XDP_TX**, as chosen 1269 | * by the caller. Any higher bits in the *flags* argument must be 1270 | * unset. 1271 | * 1272 | * See also **bpf_redirect**\ (), which only supports redirecting 1273 | * to an ifindex, but doesn't require a map to do so. 1274 | * 1275 | * Returns 1276 | * **XDP_REDIRECT** on success, or the value of the two lower bits 1277 | * of the *flags* argument on error. 1278 | */ 1279 | static long (*bpf_redirect_map)(void *map, __u32 key, __u64 flags) = (void *) 51; 1280 | 1281 | /* 1282 | * bpf_sk_redirect_map 1283 | * 1284 | * Redirect the packet to the socket referenced by *map* (of type 1285 | * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and 1286 | * egress interfaces can be used for redirection. The 1287 | * **BPF_F_INGRESS** value in *flags* is used to make the 1288 | * distinction (ingress path is selected if the flag is present, 1289 | * egress path otherwise). This is the only flag supported for now. 1290 | * 1291 | * Returns 1292 | * **SK_PASS** on success, or **SK_DROP** on error. 1293 | */ 1294 | static long (*bpf_sk_redirect_map)(struct __sk_buff *skb, void *map, __u32 key, __u64 flags) = (void *) 52; 1295 | 1296 | /* 1297 | * bpf_sock_map_update 1298 | * 1299 | * Add an entry to, or update a *map* referencing sockets. The 1300 | * *skops* is used as a new value for the entry associated to 1301 | * *key*. *flags* is one of: 1302 | * 1303 | * **BPF_NOEXIST** 1304 | * The entry for *key* must not exist in the map. 1305 | * **BPF_EXIST** 1306 | * The entry for *key* must already exist in the map. 1307 | * **BPF_ANY** 1308 | * No condition on the existence of the entry for *key*. 1309 | * 1310 | * If the *map* has eBPF programs (parser and verdict), those will 1311 | * be inherited by the socket being added. If the socket is 1312 | * already attached to eBPF programs, this results in an error. 1313 | * 1314 | * Returns 1315 | * 0 on success, or a negative error in case of failure. 1316 | */ 1317 | static long (*bpf_sock_map_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 53; 1318 | 1319 | /* 1320 | * bpf_xdp_adjust_meta 1321 | * 1322 | * Adjust the address pointed by *xdp_md*\ **->data_meta** by 1323 | * *delta* (which can be positive or negative). Note that this 1324 | * operation modifies the address stored in *xdp_md*\ **->data**, 1325 | * so the latter must be loaded only after the helper has been 1326 | * called. 1327 | * 1328 | * The use of *xdp_md*\ **->data_meta** is optional and programs 1329 | * are not required to use it. The rationale is that when the 1330 | * packet is processed with XDP (e.g. as DoS filter), it is 1331 | * possible to push further meta data along with it before passing 1332 | * to the stack, and to give the guarantee that an ingress eBPF 1333 | * program attached as a TC classifier on the same device can pick 1334 | * this up for further post-processing. 
Since TC works with socket 1335 | * buffers, it remains possible to set from XDP the **mark** or 1336 | * **priority** pointers, or other pointers for the socket buffer. 1337 | * Having this scratch space generic and programmable allows for 1338 | * more flexibility as the user is free to store whatever meta 1339 | * data they need. 1340 | * 1341 | * A call to this helper is susceptible to change the underlying 1342 | * packet buffer. Therefore, at load time, all checks on pointers 1343 | * previously done by the verifier are invalidated and must be 1344 | * performed again, if the helper is used in combination with 1345 | * direct packet access. 1346 | * 1347 | * Returns 1348 | * 0 on success, or a negative error in case of failure. 1349 | */ 1350 | static long (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 54; 1351 | 1352 | /* 1353 | * bpf_perf_event_read_value 1354 | * 1355 | * Read the value of a perf event counter, and store it into *buf* 1356 | * of size *buf_size*. This helper relies on a *map* of type 1357 | * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event 1358 | * counter is selected when *map* is updated with perf event file 1359 | * descriptors. The *map* is an array whose size is the number of 1360 | * available CPUs, and each cell contains a value relative to one 1361 | * CPU. The value to retrieve is indicated by *flags*, that 1362 | * contains the index of the CPU to look up, masked with 1363 | * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to 1364 | * **BPF_F_CURRENT_CPU** to indicate that the value for the 1365 | * current CPU should be retrieved. 1366 | * 1367 | * This helper behaves in a way close to 1368 | * **bpf_perf_event_read**\ () helper, save that instead of 1369 | * just returning the value observed, it fills the *buf* 1370 | * structure. This allows for additional data to be retrieved: in 1371 | * particular, the enabled and running times (in *buf*\ 1372 | * **->enabled** and *buf*\ **->running**, respectively) are 1373 | * copied. In general, **bpf_perf_event_read_value**\ () is 1374 | * recommended over **bpf_perf_event_read**\ (), which has some 1375 | * ABI issues and provides fewer functionalities. 1376 | * 1377 | * These values are interesting, because hardware PMU (Performance 1378 | * Monitoring Unit) counters are limited resources. When there are 1379 | * more PMU based perf events opened than available counters, 1380 | * kernel will multiplex these events so each event gets certain 1381 | * percentage (but not all) of the PMU time. In case that 1382 | * multiplexing happens, the number of samples or counter value 1383 | * will not reflect the case compared to when no multiplexing 1384 | * occurs. This makes comparison between different runs difficult. 1385 | * Typically, the counter value should be normalized before 1386 | * comparing to other experiments. The usual normalization is done 1387 | * as follows. 1388 | * 1389 | * :: 1390 | * 1391 | * normalized_counter = counter * t_enabled / t_running 1392 | * 1393 | * Where t_enabled is the time enabled for event and t_running is 1394 | * the time running for event since last normalization. The 1395 | * enabled and running times are accumulated since the perf event 1396 | * open. To achieve scaling factor between two invocations of an 1397 | * eBPF program, users can use CPU id as the key (which is 1398 | * typical for perf array usage model) to remember the previous 1399 | * value and do the calculation inside the eBPF program. 
1400 | * 1401 | * Returns 1402 | * 0 on success, or a negative error in case of failure. 1403 | */ 1404 | static long (*bpf_perf_event_read_value)(void *map, __u64 flags, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 55; 1405 | 1406 | /* 1407 | * bpf_perf_prog_read_value 1408 | * 1409 | * For en eBPF program attached to a perf event, retrieve the 1410 | * value of the event counter associated to *ctx* and store it in 1411 | * the structure pointed by *buf* and of size *buf_size*. Enabled 1412 | * and running times are also stored in the structure (see 1413 | * description of helper **bpf_perf_event_read_value**\ () for 1414 | * more details). 1415 | * 1416 | * Returns 1417 | * 0 on success, or a negative error in case of failure. 1418 | */ 1419 | static long (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 56; 1420 | 1421 | /* 1422 | * bpf_getsockopt 1423 | * 1424 | * Emulate a call to **getsockopt()** on the socket associated to 1425 | * *bpf_socket*, which must be a full socket. The *level* at 1426 | * which the option resides and the name *optname* of the option 1427 | * must be specified, see **getsockopt(2)** for more information. 1428 | * The retrieved value is stored in the structure pointed by 1429 | * *opval* and of length *optlen*. 1430 | * 1431 | * *bpf_socket* should be one of the following: 1432 | * 1433 | * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 1434 | * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** 1435 | * and **BPF_CGROUP_INET6_CONNECT**. 1436 | * 1437 | * This helper actually implements a subset of **getsockopt()**. 1438 | * It supports the following *level*\ s: 1439 | * 1440 | * * **IPPROTO_TCP**, which supports *optname* 1441 | * **TCP_CONGESTION**. 1442 | * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. 1443 | * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. 1444 | * 1445 | * Returns 1446 | * 0 on success, or a negative error in case of failure. 1447 | */ 1448 | static long (*bpf_getsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 57; 1449 | 1450 | /* 1451 | * bpf_override_return 1452 | * 1453 | * Used for error injection, this helper uses kprobes to override 1454 | * the return value of the probed function, and to set it to *rc*. 1455 | * The first argument is the context *regs* on which the kprobe 1456 | * works. 1457 | * 1458 | * This helper works by setting the PC (program counter) 1459 | * to an override function which is run in place of the original 1460 | * probed function. This means the probed function is not run at 1461 | * all. The replacement function just returns with the required 1462 | * value. 1463 | * 1464 | * This helper has security implications, and thus is subject to 1465 | * restrictions. It is only available if the kernel was compiled 1466 | * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration 1467 | * option, and in this case it only works on functions tagged with 1468 | * **ALLOW_ERROR_INJECTION** in the kernel code. 1469 | * 1470 | * Also, the helper is only available for the architectures having 1471 | * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, 1472 | * x86 architecture is the only one to support this feature. 
1473 | * 1474 | * Returns 1475 | * 0 1476 | */ 1477 | static long (*bpf_override_return)(struct pt_regs *regs, __u64 rc) = (void *) 58; 1478 | 1479 | /* 1480 | * bpf_sock_ops_cb_flags_set 1481 | * 1482 | * Attempt to set the value of the **bpf_sock_ops_cb_flags** field 1483 | * for the full TCP socket associated to *bpf_sock_ops* to 1484 | * *argval*. 1485 | * 1486 | * The primary use of this field is to determine if there should 1487 | * be calls to eBPF programs of type 1488 | * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP 1489 | * code. A program of the same type can change its value, per 1490 | * connection and as necessary, when the connection is 1491 | * established. This field is directly accessible for reading, but 1492 | * this helper must be used for updates in order to return an 1493 | * error if an eBPF program tries to set a callback that is not 1494 | * supported in the current kernel. 1495 | * 1496 | * *argval* is a flag array which can combine these flags: 1497 | * 1498 | * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) 1499 | * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) 1500 | * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) 1501 | * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) 1502 | * 1503 | * Therefore, this function can be used to clear a callback flag by 1504 | * setting the appropriate bit to zero. e.g. to disable the RTO 1505 | * callback: 1506 | * 1507 | * **bpf_sock_ops_cb_flags_set(bpf_sock,** 1508 | * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** 1509 | * 1510 | * Here are some examples of where one could call such eBPF 1511 | * program: 1512 | * 1513 | * * When RTO fires. 1514 | * * When a packet is retransmitted. 1515 | * * When the connection terminates. 1516 | * * When a packet is sent. 1517 | * * When a packet is received. 1518 | * 1519 | * Returns 1520 | * Code **-EINVAL** if the socket is not a full TCP socket; 1521 | * otherwise, a positive number containing the bits that could not 1522 | * be set is returned (which comes down to 0 if all bits were set 1523 | * as required). 1524 | */ 1525 | static long (*bpf_sock_ops_cb_flags_set)(struct bpf_sock_ops *bpf_sock, int argval) = (void *) 59; 1526 | 1527 | /* 1528 | * bpf_msg_redirect_map 1529 | * 1530 | * This helper is used in programs implementing policies at the 1531 | * socket level. If the message *msg* is allowed to pass (i.e. if 1532 | * the verdict eBPF program returns **SK_PASS**), redirect it to 1533 | * the socket referenced by *map* (of type 1534 | * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and 1535 | * egress interfaces can be used for redirection. The 1536 | * **BPF_F_INGRESS** value in *flags* is used to make the 1537 | * distinction (ingress path is selected if the flag is present, 1538 | * egress path otherwise). This is the only flag supported for now. 1539 | * 1540 | * Returns 1541 | * **SK_PASS** on success, or **SK_DROP** on error. 1542 | */ 1543 | static long (*bpf_msg_redirect_map)(struct sk_msg_md *msg, void *map, __u32 key, __u64 flags) = (void *) 60; 1544 | 1545 | /* 1546 | * bpf_msg_apply_bytes 1547 | * 1548 | * For socket policies, apply the verdict of the eBPF program to 1549 | * the next *bytes* (number of bytes) of message *msg*. 
1550 | * 1551 | * For example, this helper can be used in the following cases: 1552 | * 1553 | * * A single **sendmsg**\ () or **sendfile**\ () system call 1554 | * contains multiple logical messages that the eBPF program is 1555 | * supposed to read and for which it should apply a verdict. 1556 | * * An eBPF program only cares to read the first *bytes* of a 1557 | * *msg*. If the message has a large payload, then setting up 1558 | * and calling the eBPF program repeatedly for all bytes, even 1559 | * though the verdict is already known, would create unnecessary 1560 | * overhead. 1561 | * 1562 | * When called from within an eBPF program, the helper sets a 1563 | * counter internal to the BPF infrastructure, that is used to 1564 | * apply the last verdict to the next *bytes*. If *bytes* is 1565 | * smaller than the current data being processed from a 1566 | * **sendmsg**\ () or **sendfile**\ () system call, the first 1567 | * *bytes* will be sent and the eBPF program will be re-run with 1568 | * the pointer for start of data pointing to byte number *bytes* 1569 | * **+ 1**. If *bytes* is larger than the current data being 1570 | * processed, then the eBPF verdict will be applied to multiple 1571 | * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are 1572 | * consumed. 1573 | * 1574 | * Note that if a socket closes with the internal counter holding 1575 | * a non-zero value, this is not a problem because data is not 1576 | * being buffered for *bytes* and is sent as it is received. 1577 | * 1578 | * Returns 1579 | * 0 1580 | */ 1581 | static long (*bpf_msg_apply_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 61; 1582 | 1583 | /* 1584 | * bpf_msg_cork_bytes 1585 | * 1586 | * For socket policies, prevent the execution of the verdict eBPF 1587 | * program for message *msg* until *bytes* (byte number) have been 1588 | * accumulated. 1589 | * 1590 | * This can be used when one needs a specific number of bytes 1591 | * before a verdict can be assigned, even if the data spans 1592 | * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme 1593 | * case would be a user calling **sendmsg**\ () repeatedly with 1594 | * 1-byte long message segments. Obviously, this is bad for 1595 | * performance, but it is still valid. If the eBPF program needs 1596 | * *bytes* bytes to validate a header, this helper can be used to 1597 | * prevent the eBPF program to be called again until *bytes* have 1598 | * been accumulated. 1599 | * 1600 | * Returns 1601 | * 0 1602 | */ 1603 | static long (*bpf_msg_cork_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 62; 1604 | 1605 | /* 1606 | * bpf_msg_pull_data 1607 | * 1608 | * For socket policies, pull in non-linear data from user space 1609 | * for *msg* and set pointers *msg*\ **->data** and *msg*\ 1610 | * **->data_end** to *start* and *end* bytes offsets into *msg*, 1611 | * respectively. 1612 | * 1613 | * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a 1614 | * *msg* it can only parse data that the (**data**, **data_end**) 1615 | * pointers have already consumed. For **sendmsg**\ () hooks this 1616 | * is likely the first scatterlist element. But for calls relying 1617 | * on the **sendpage** handler (e.g. **sendfile**\ ()) this will 1618 | * be the range (**0**, **0**) because the data is shared with 1619 | * user space and by default the objective is to avoid allowing 1620 | * user space to modify data while (or after) eBPF verdict is 1621 | * being decided. 
This helper can be used to pull in data and to 1622 | * set the start and end pointer to given values. Data will be 1623 | * copied if necessary (i.e. if data was not linear and if start 1624 | * and end pointers do not point to the same chunk). 1625 | * 1626 | * A call to this helper is susceptible to change the underlying 1627 | * packet buffer. Therefore, at load time, all checks on pointers 1628 | * previously done by the verifier are invalidated and must be 1629 | * performed again, if the helper is used in combination with 1630 | * direct packet access. 1631 | * 1632 | * All values for *flags* are reserved for future usage, and must 1633 | * be left at zero. 1634 | * 1635 | * Returns 1636 | * 0 on success, or a negative error in case of failure. 1637 | */ 1638 | static long (*bpf_msg_pull_data)(struct sk_msg_md *msg, __u32 start, __u32 end, __u64 flags) = (void *) 63; 1639 | 1640 | /* 1641 | * bpf_bind 1642 | * 1643 | * Bind the socket associated to *ctx* to the address pointed by 1644 | * *addr*, of length *addr_len*. This allows for making outgoing 1645 | * connection from the desired IP address, which can be useful for 1646 | * example when all processes inside a cgroup should use one 1647 | * single IP address on a host that has multiple IP configured. 1648 | * 1649 | * This helper works for IPv4 and IPv6, TCP and UDP sockets. The 1650 | * domain (*addr*\ **->sa_family**) must be **AF_INET** (or 1651 | * **AF_INET6**). It's advised to pass zero port (**sin_port** 1652 | * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like 1653 | * behavior and lets the kernel efficiently pick up an unused 1654 | * port as long as 4-tuple is unique. Passing non-zero port might 1655 | * lead to degraded performance. 1656 | * 1657 | * Returns 1658 | * 0 on success, or a negative error in case of failure. 1659 | */ 1660 | static long (*bpf_bind)(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) = (void *) 64; 1661 | 1662 | /* 1663 | * bpf_xdp_adjust_tail 1664 | * 1665 | * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is 1666 | * possible to both shrink and grow the packet tail. 1667 | * Shrink done via *delta* being a negative integer. 1668 | * 1669 | * A call to this helper is susceptible to change the underlying 1670 | * packet buffer. Therefore, at load time, all checks on pointers 1671 | * previously done by the verifier are invalidated and must be 1672 | * performed again, if the helper is used in combination with 1673 | * direct packet access. 1674 | * 1675 | * Returns 1676 | * 0 on success, or a negative error in case of failure. 1677 | */ 1678 | static long (*bpf_xdp_adjust_tail)(struct xdp_md *xdp_md, int delta) = (void *) 65; 1679 | 1680 | /* 1681 | * bpf_skb_get_xfrm_state 1682 | * 1683 | * Retrieve the XFRM state (IP transform framework, see also 1684 | * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. 1685 | * 1686 | * The retrieved value is stored in the **struct bpf_xfrm_state** 1687 | * pointed by *xfrm_state* and of length *size*. 1688 | * 1689 | * All values for *flags* are reserved for future usage, and must 1690 | * be left at zero. 1691 | * 1692 | * This helper is available only if the kernel was compiled with 1693 | * **CONFIG_XFRM** configuration option. 1694 | * 1695 | * Returns 1696 | * 0 on success, or a negative error in case of failure. 
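To make the zero-port advice in the bpf_bind() description above concrete, here is a hedged sketch in the style of a BPF_CGROUP_INET4_CONNECT program. The source address is a documentation address, and the sketch assumes struct sockaddr_in and AF_INET from the usual uapi headers plus bpf_htonl() from the vendored bpf_endian.h; it is not part of this repository.

```c
/* Illustrative sketch: pin outgoing IPv4 connections to one source address
 * while leaving sin_port at 0 so the kernel picks a free port, as the
 * documentation above recommends. 192.0.2.1 is from TEST-NET-1. */
static inline int bind_source_addr(struct bpf_sock_addr *ctx)
{
	struct sockaddr_in addr = {};

	addr.sin_family      = AF_INET;
	addr.sin_port        = 0;                     /* kernel chooses the port */
	addr.sin_addr.s_addr = bpf_htonl(0xC0000201); /* 192.0.2.1 */

	return bpf_bind(ctx, (struct sockaddr *)&addr, sizeof(addr));
}
```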
1697 | */ 1698 | static long (*bpf_skb_get_xfrm_state)(struct __sk_buff *skb, __u32 index, struct bpf_xfrm_state *xfrm_state, __u32 size, __u64 flags) = (void *) 66; 1699 | 1700 | /* 1701 | * bpf_get_stack 1702 | * 1703 | * Return a user or a kernel stack in bpf program provided buffer. 1704 | * To achieve this, the helper needs *ctx*, which is a pointer 1705 | * to the context on which the tracing program is executed. 1706 | * To store the stacktrace, the bpf program provides *buf* with 1707 | * a nonnegative *size*. 1708 | * 1709 | * The last argument, *flags*, holds the number of stack frames to 1710 | * skip (from 0 to 255), masked with 1711 | * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set 1712 | * the following flags: 1713 | * 1714 | * **BPF_F_USER_STACK** 1715 | * Collect a user space stack instead of a kernel stack. 1716 | * **BPF_F_USER_BUILD_ID** 1717 | * Collect buildid+offset instead of ips for user stack, 1718 | * only valid if **BPF_F_USER_STACK** is also specified. 1719 | * 1720 | * **bpf_get_stack**\ () can collect up to 1721 | * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject 1722 | * to sufficient large buffer size. Note that 1723 | * this limit can be controlled with the **sysctl** program, and 1724 | * that it should be manually increased in order to profile long 1725 | * user stacks (such as stacks for Java programs). To do so, use: 1726 | * 1727 | * :: 1728 | * 1729 | * # sysctl kernel.perf_event_max_stack= 1730 | * 1731 | * Returns 1732 | * A non-negative value equal to or less than *size* on success, 1733 | * or a negative error in case of failure. 1734 | */ 1735 | static long (*bpf_get_stack)(void *ctx, void *buf, __u32 size, __u64 flags) = (void *) 67; 1736 | 1737 | /* 1738 | * bpf_skb_load_bytes_relative 1739 | * 1740 | * This helper is similar to **bpf_skb_load_bytes**\ () in that 1741 | * it provides an easy way to load *len* bytes from *offset* 1742 | * from the packet associated to *skb*, into the buffer pointed 1743 | * by *to*. The difference to **bpf_skb_load_bytes**\ () is that 1744 | * a fifth argument *start_header* exists in order to select a 1745 | * base offset to start from. *start_header* can be one of: 1746 | * 1747 | * **BPF_HDR_START_MAC** 1748 | * Base offset to load data from is *skb*'s mac header. 1749 | * **BPF_HDR_START_NET** 1750 | * Base offset to load data from is *skb*'s network header. 1751 | * 1752 | * In general, "direct packet access" is the preferred method to 1753 | * access packet data, however, this helper is in particular useful 1754 | * in socket filters where *skb*\ **->data** does not always point 1755 | * to the start of the mac header and where "direct packet access" 1756 | * is not available. 1757 | * 1758 | * Returns 1759 | * 0 on success, or a negative error in case of failure. 1760 | */ 1761 | static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void *to, __u32 len, __u32 start_header) = (void *) 68; 1762 | 1763 | /* 1764 | * bpf_fib_lookup 1765 | * 1766 | * Do FIB lookup in kernel tables using parameters in *params*. 1767 | * If lookup is successful and result shows packet is to be 1768 | * forwarded, the neighbor tables are searched for the nexthop. 
1769 | * If successful (ie., FIB lookup shows forwarding and nexthop 1770 | * is resolved), the nexthop address is returned in ipv4_dst 1771 | * or ipv6_dst based on family, smac is set to mac address of 1772 | * egress device, dmac is set to nexthop mac address, rt_metric 1773 | * is set to metric from route (IPv4/IPv6 only), and ifindex 1774 | * is set to the device index of the nexthop from the FIB lookup. 1775 | * 1776 | * *plen* argument is the size of the passed in struct. 1777 | * *flags* argument can be a combination of one or more of the 1778 | * following values: 1779 | * 1780 | * **BPF_FIB_LOOKUP_DIRECT** 1781 | * Do a direct table lookup vs full lookup using FIB 1782 | * rules. 1783 | * **BPF_FIB_LOOKUP_OUTPUT** 1784 | * Perform lookup from an egress perspective (default is 1785 | * ingress). 1786 | * 1787 | * *ctx* is either **struct xdp_md** for XDP programs or 1788 | * **struct sk_buff** tc cls_act programs. 1789 | * 1790 | * Returns 1791 | * * < 0 if any input argument is invalid 1792 | * * 0 on success (packet is forwarded, nexthop neighbor exists) 1793 | * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the 1794 | * packet is not forwarded or needs assist from full stack 1795 | */ 1796 | static long (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, int plen, __u32 flags) = (void *) 69; 1797 | 1798 | /* 1799 | * bpf_sock_hash_update 1800 | * 1801 | * Add an entry to, or update a sockhash *map* referencing sockets. 1802 | * The *skops* is used as a new value for the entry associated to 1803 | * *key*. *flags* is one of: 1804 | * 1805 | * **BPF_NOEXIST** 1806 | * The entry for *key* must not exist in the map. 1807 | * **BPF_EXIST** 1808 | * The entry for *key* must already exist in the map. 1809 | * **BPF_ANY** 1810 | * No condition on the existence of the entry for *key*. 1811 | * 1812 | * If the *map* has eBPF programs (parser and verdict), those will 1813 | * be inherited by the socket being added. If the socket is 1814 | * already attached to eBPF programs, this results in an error. 1815 | * 1816 | * Returns 1817 | * 0 on success, or a negative error in case of failure. 1818 | */ 1819 | static long (*bpf_sock_hash_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 70; 1820 | 1821 | /* 1822 | * bpf_msg_redirect_hash 1823 | * 1824 | * This helper is used in programs implementing policies at the 1825 | * socket level. If the message *msg* is allowed to pass (i.e. if 1826 | * the verdict eBPF program returns **SK_PASS**), redirect it to 1827 | * the socket referenced by *map* (of type 1828 | * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and 1829 | * egress interfaces can be used for redirection. The 1830 | * **BPF_F_INGRESS** value in *flags* is used to make the 1831 | * distinction (ingress path is selected if the flag is present, 1832 | * egress path otherwise). This is the only flag supported for now. 1833 | * 1834 | * Returns 1835 | * **SK_PASS** on success, or **SK_DROP** on error. 1836 | */ 1837 | static long (*bpf_msg_redirect_hash)(struct sk_msg_md *msg, void *map, void *key, __u64 flags) = (void *) 71; 1838 | 1839 | /* 1840 | * bpf_sk_redirect_hash 1841 | * 1842 | * This helper is used in programs implementing policies at the 1843 | * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. 1844 | * if the verdeict eBPF program returns **SK_PASS**), redirect it 1845 | * to the socket referenced by *map* (of type 1846 | * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. 
Both ingress and 1847 | * egress interfaces can be used for redirection. The 1848 | * **BPF_F_INGRESS** value in *flags* is used to make the 1849 | * distinction (ingress path is selected if the flag is present, 1850 | * egress otherwise). This is the only flag supported for now. 1851 | * 1852 | * Returns 1853 | * **SK_PASS** on success, or **SK_DROP** on error. 1854 | */ 1855 | static long (*bpf_sk_redirect_hash)(struct __sk_buff *skb, void *map, void *key, __u64 flags) = (void *) 72; 1856 | 1857 | /* 1858 | * bpf_lwt_push_encap 1859 | * 1860 | * Encapsulate the packet associated to *skb* within a Layer 3 1861 | * protocol header. This header is provided in the buffer at 1862 | * address *hdr*, with *len* its size in bytes. *type* indicates 1863 | * the protocol of the header and can be one of: 1864 | * 1865 | * **BPF_LWT_ENCAP_SEG6** 1866 | * IPv6 encapsulation with Segment Routing Header 1867 | * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, 1868 | * the IPv6 header is computed by the kernel. 1869 | * **BPF_LWT_ENCAP_SEG6_INLINE** 1870 | * Only works if *skb* contains an IPv6 packet. Insert a 1871 | * Segment Routing Header (**struct ipv6_sr_hdr**) inside 1872 | * the IPv6 header. 1873 | * **BPF_LWT_ENCAP_IP** 1874 | * IP encapsulation (GRE/GUE/IPIP/etc). The outer header 1875 | * must be IPv4 or IPv6, followed by zero or more 1876 | * additional headers, up to **LWT_BPF_MAX_HEADROOM** 1877 | * total bytes in all prepended headers. Please note that 1878 | * if **skb_is_gso**\ (*skb*) is true, no more than two 1879 | * headers can be prepended, and the inner header, if 1880 | * present, should be either GRE or UDP/GUE. 1881 | * 1882 | * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs 1883 | * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can 1884 | * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and 1885 | * **BPF_PROG_TYPE_LWT_XMIT**. 1886 | * 1887 | * A call to this helper is susceptible to change the underlying 1888 | * packet buffer. Therefore, at load time, all checks on pointers 1889 | * previously done by the verifier are invalidated and must be 1890 | * performed again, if the helper is used in combination with 1891 | * direct packet access. 1892 | * 1893 | * Returns 1894 | * 0 on success, or a negative error in case of failure. 1895 | */ 1896 | static long (*bpf_lwt_push_encap)(struct __sk_buff *skb, __u32 type, void *hdr, __u32 len) = (void *) 73; 1897 | 1898 | /* 1899 | * bpf_lwt_seg6_store_bytes 1900 | * 1901 | * Store *len* bytes from address *from* into the packet 1902 | * associated to *skb*, at *offset*. Only the flags, tag and TLVs 1903 | * inside the outermost IPv6 Segment Routing Header can be 1904 | * modified through this helper. 1905 | * 1906 | * A call to this helper is susceptible to change the underlying 1907 | * packet buffer. Therefore, at load time, all checks on pointers 1908 | * previously done by the verifier are invalidated and must be 1909 | * performed again, if the helper is used in combination with 1910 | * direct packet access. 1911 | * 1912 | * Returns 1913 | * 0 on success, or a negative error in case of failure. 1914 | */ 1915 | static long (*bpf_lwt_seg6_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len) = (void *) 74; 1916 | 1917 | /* 1918 | * bpf_lwt_seg6_adjust_srh 1919 | * 1920 | * Adjust the size allocated to TLVs in the outermost IPv6 1921 | * Segment Routing Header contained in the packet associated to 1922 | * *skb*, at position *offset* by *delta* bytes. 
Only offsets 1923 | * after the segments are accepted. *delta* can be as well 1924 | * positive (growing) as negative (shrinking). 1925 | * 1926 | * A call to this helper is susceptible to change the underlying 1927 | * packet buffer. Therefore, at load time, all checks on pointers 1928 | * previously done by the verifier are invalidated and must be 1929 | * performed again, if the helper is used in combination with 1930 | * direct packet access. 1931 | * 1932 | * Returns 1933 | * 0 on success, or a negative error in case of failure. 1934 | */ 1935 | static long (*bpf_lwt_seg6_adjust_srh)(struct __sk_buff *skb, __u32 offset, __s32 delta) = (void *) 75; 1936 | 1937 | /* 1938 | * bpf_lwt_seg6_action 1939 | * 1940 | * Apply an IPv6 Segment Routing action of type *action* to the 1941 | * packet associated to *skb*. Each action takes a parameter 1942 | * contained at address *param*, and of length *param_len* bytes. 1943 | * *action* can be one of: 1944 | * 1945 | * **SEG6_LOCAL_ACTION_END_X** 1946 | * End.X action: Endpoint with Layer-3 cross-connect. 1947 | * Type of *param*: **struct in6_addr**. 1948 | * **SEG6_LOCAL_ACTION_END_T** 1949 | * End.T action: Endpoint with specific IPv6 table lookup. 1950 | * Type of *param*: **int**. 1951 | * **SEG6_LOCAL_ACTION_END_B6** 1952 | * End.B6 action: Endpoint bound to an SRv6 policy. 1953 | * Type of *param*: **struct ipv6_sr_hdr**. 1954 | * **SEG6_LOCAL_ACTION_END_B6_ENCAP** 1955 | * End.B6.Encap action: Endpoint bound to an SRv6 1956 | * encapsulation policy. 1957 | * Type of *param*: **struct ipv6_sr_hdr**. 1958 | * 1959 | * A call to this helper is susceptible to change the underlying 1960 | * packet buffer. Therefore, at load time, all checks on pointers 1961 | * previously done by the verifier are invalidated and must be 1962 | * performed again, if the helper is used in combination with 1963 | * direct packet access. 1964 | * 1965 | * Returns 1966 | * 0 on success, or a negative error in case of failure. 1967 | */ 1968 | static long (*bpf_lwt_seg6_action)(struct __sk_buff *skb, __u32 action, void *param, __u32 param_len) = (void *) 76; 1969 | 1970 | /* 1971 | * bpf_rc_repeat 1972 | * 1973 | * This helper is used in programs implementing IR decoding, to 1974 | * report a successfully decoded repeat key message. This delays 1975 | * the generation of a key up event for previously generated 1976 | * key down event. 1977 | * 1978 | * Some IR protocols like NEC have a special IR message for 1979 | * repeating last button, for when a button is held down. 1980 | * 1981 | * The *ctx* should point to the lirc sample as passed into 1982 | * the program. 1983 | * 1984 | * This helper is only available is the kernel was compiled with 1985 | * the **CONFIG_BPF_LIRC_MODE2** configuration option set to 1986 | * "**y**". 1987 | * 1988 | * Returns 1989 | * 0 1990 | */ 1991 | static long (*bpf_rc_repeat)(void *ctx) = (void *) 77; 1992 | 1993 | /* 1994 | * bpf_rc_keydown 1995 | * 1996 | * This helper is used in programs implementing IR decoding, to 1997 | * report a successfully decoded key press with *scancode*, 1998 | * *toggle* value in the given *protocol*. The scancode will be 1999 | * translated to a keycode using the rc keymap, and reported as 2000 | * an input key down event. After a period a key up event is 2001 | * generated. This period can be extended by calling either 2002 | * **bpf_rc_keydown**\ () again with the same values, or calling 2003 | * **bpf_rc_repeat**\ (). 
2004 | * 2005 | * Some protocols include a toggle bit, in case the button was 2006 | * released and pressed again between consecutive scancodes. 2007 | * 2008 | * The *ctx* should point to the lirc sample as passed into 2009 | * the program. 2010 | * 2011 | * The *protocol* is the decoded protocol number (see 2012 | * **enum rc_proto** for some predefined values). 2013 | * 2014 | * This helper is only available is the kernel was compiled with 2015 | * the **CONFIG_BPF_LIRC_MODE2** configuration option set to 2016 | * "**y**". 2017 | * 2018 | * Returns 2019 | * 0 2020 | */ 2021 | static long (*bpf_rc_keydown)(void *ctx, __u32 protocol, __u64 scancode, __u32 toggle) = (void *) 78; 2022 | 2023 | /* 2024 | * bpf_skb_cgroup_id 2025 | * 2026 | * Return the cgroup v2 id of the socket associated with the *skb*. 2027 | * This is roughly similar to the **bpf_get_cgroup_classid**\ () 2028 | * helper for cgroup v1 by providing a tag resp. identifier that 2029 | * can be matched on or used for map lookups e.g. to implement 2030 | * policy. The cgroup v2 id of a given path in the hierarchy is 2031 | * exposed in user space through the f_handle API in order to get 2032 | * to the same 64-bit id. 2033 | * 2034 | * This helper can be used on TC egress path, but not on ingress, 2035 | * and is available only if the kernel was compiled with the 2036 | * **CONFIG_SOCK_CGROUP_DATA** configuration option. 2037 | * 2038 | * Returns 2039 | * The id is returned or 0 in case the id could not be retrieved. 2040 | */ 2041 | static __u64 (*bpf_skb_cgroup_id)(struct __sk_buff *skb) = (void *) 79; 2042 | 2043 | /* 2044 | * bpf_get_current_cgroup_id 2045 | * 2046 | * 2047 | * Returns 2048 | * A 64-bit integer containing the current cgroup id based 2049 | * on the cgroup within which the current task is running. 2050 | */ 2051 | static __u64 (*bpf_get_current_cgroup_id)(void) = (void *) 80; 2052 | 2053 | /* 2054 | * bpf_get_local_storage 2055 | * 2056 | * Get the pointer to the local storage area. 2057 | * The type and the size of the local storage is defined 2058 | * by the *map* argument. 2059 | * The *flags* meaning is specific for each map type, 2060 | * and has to be 0 for cgroup local storage. 2061 | * 2062 | * Depending on the BPF program type, a local storage area 2063 | * can be shared between multiple instances of the BPF program, 2064 | * running simultaneously. 2065 | * 2066 | * A user should care about the synchronization by himself. 2067 | * For example, by using the **BPF_STX_XADD** instruction to alter 2068 | * the shared data. 2069 | * 2070 | * Returns 2071 | * A pointer to the local storage area. 2072 | */ 2073 | static void *(*bpf_get_local_storage)(void *map, __u64 flags) = (void *) 81; 2074 | 2075 | /* 2076 | * bpf_sk_select_reuseport 2077 | * 2078 | * Select a **SO_REUSEPORT** socket from a 2079 | * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. 2080 | * It checks the selected socket is matching the incoming 2081 | * request in the socket buffer. 2082 | * 2083 | * Returns 2084 | * 0 on success, or a negative error in case of failure. 2085 | */ 2086 | static long (*bpf_sk_select_reuseport)(struct sk_reuseport_md *reuse, void *map, void *key, __u64 flags) = (void *) 82; 2087 | 2088 | /* 2089 | * bpf_skb_ancestor_cgroup_id 2090 | * 2091 | * Return id of cgroup v2 that is ancestor of cgroup associated 2092 | * with the *skb* at the *ancestor_level*. The root cgroup is at 2093 | * *ancestor_level* zero and each step down the hierarchy 2094 | * increments the level. 
If *ancestor_level* == level of cgroup 2095 | * associated with *skb*, then return value will be same as that 2096 | * of **bpf_skb_cgroup_id**\ (). 2097 | * 2098 | * The helper is useful to implement policies based on cgroups 2099 | * that are upper in hierarchy than immediate cgroup associated 2100 | * with *skb*. 2101 | * 2102 | * The format of returned id and helper limitations are same as in 2103 | * **bpf_skb_cgroup_id**\ (). 2104 | * 2105 | * Returns 2106 | * The id is returned or 0 in case the id could not be retrieved. 2107 | */ 2108 | static __u64 (*bpf_skb_ancestor_cgroup_id)(struct __sk_buff *skb, int ancestor_level) = (void *) 83; 2109 | 2110 | /* 2111 | * bpf_sk_lookup_tcp 2112 | * 2113 | * Look for TCP socket matching *tuple*, optionally in a child 2114 | * network namespace *netns*. The return value must be checked, 2115 | * and if non-**NULL**, released via **bpf_sk_release**\ (). 2116 | * 2117 | * The *ctx* should point to the context of the program, such as 2118 | * the skb or socket (depending on the hook in use). This is used 2119 | * to determine the base network namespace for the lookup. 2120 | * 2121 | * *tuple_size* must be one of: 2122 | * 2123 | * **sizeof**\ (*tuple*\ **->ipv4**) 2124 | * Look for an IPv4 socket. 2125 | * **sizeof**\ (*tuple*\ **->ipv6**) 2126 | * Look for an IPv6 socket. 2127 | * 2128 | * If the *netns* is a negative signed 32-bit integer, then the 2129 | * socket lookup table in the netns associated with the *ctx* 2130 | * will be used. For the TC hooks, this is the netns of the device 2131 | * in the skb. For socket hooks, this is the netns of the socket. 2132 | * If *netns* is any other signed 32-bit value greater than or 2133 | * equal to zero then it specifies the ID of the netns relative to 2134 | * the netns associated with the *ctx*. *netns* values beyond the 2135 | * range of 32-bit integers are reserved for future use. 2136 | * 2137 | * All values for *flags* are reserved for future usage, and must 2138 | * be left at zero. 2139 | * 2140 | * This helper is available only if the kernel was compiled with 2141 | * **CONFIG_NET** configuration option. 2142 | * 2143 | * Returns 2144 | * Pointer to **struct bpf_sock**, or **NULL** in case of failure. 2145 | * For sockets with reuseport option, the **struct bpf_sock** 2146 | * result is from *reuse*\ **->socks**\ [] using the hash of the 2147 | * tuple. 2148 | */ 2149 | static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 84; 2150 | 2151 | /* 2152 | * bpf_sk_lookup_udp 2153 | * 2154 | * Look for UDP socket matching *tuple*, optionally in a child 2155 | * network namespace *netns*. The return value must be checked, 2156 | * and if non-**NULL**, released via **bpf_sk_release**\ (). 2157 | * 2158 | * The *ctx* should point to the context of the program, such as 2159 | * the skb or socket (depending on the hook in use). This is used 2160 | * to determine the base network namespace for the lookup. 2161 | * 2162 | * *tuple_size* must be one of: 2163 | * 2164 | * **sizeof**\ (*tuple*\ **->ipv4**) 2165 | * Look for an IPv4 socket. 2166 | * **sizeof**\ (*tuple*\ **->ipv6**) 2167 | * Look for an IPv6 socket. 2168 | * 2169 | * If the *netns* is a negative signed 32-bit integer, then the 2170 | * socket lookup table in the netns associated with the *ctx* 2171 | * will be used. For the TC hooks, this is the netns of the device 2172 | * in the skb. For socket hooks, this is the netns of the socket. 
2173 | * If *netns* is any other signed 32-bit value greater than or 2174 | * equal to zero then it specifies the ID of the netns relative to 2175 | * the netns associated with the *ctx*. *netns* values beyond the 2176 | * range of 32-bit integers are reserved for future use. 2177 | * 2178 | * All values for *flags* are reserved for future usage, and must 2179 | * be left at zero. 2180 | * 2181 | * This helper is available only if the kernel was compiled with 2182 | * **CONFIG_NET** configuration option. 2183 | * 2184 | * Returns 2185 | * Pointer to **struct bpf_sock**, or **NULL** in case of failure. 2186 | * For sockets with reuseport option, the **struct bpf_sock** 2187 | * result is from *reuse*\ **->socks**\ [] using the hash of the 2188 | * tuple. 2189 | */ 2190 | static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 85; 2191 | 2192 | /* 2193 | * bpf_sk_release 2194 | * 2195 | * Release the reference held by *sock*. *sock* must be a 2196 | * non-**NULL** pointer that was returned from 2197 | * **bpf_sk_lookup_xxx**\ (). 2198 | * 2199 | * Returns 2200 | * 0 on success, or a negative error in case of failure. 2201 | */ 2202 | static long (*bpf_sk_release)(struct bpf_sock *sock) = (void *) 86; 2203 | 2204 | /* 2205 | * bpf_map_push_elem 2206 | * 2207 | * Push an element *value* in *map*. *flags* is one of: 2208 | * 2209 | * **BPF_EXIST** 2210 | * If the queue/stack is full, the oldest element is 2211 | * removed to make room for this. 2212 | * 2213 | * Returns 2214 | * 0 on success, or a negative error in case of failure. 2215 | */ 2216 | static long (*bpf_map_push_elem)(void *map, const void *value, __u64 flags) = (void *) 87; 2217 | 2218 | /* 2219 | * bpf_map_pop_elem 2220 | * 2221 | * Pop an element from *map*. 2222 | * 2223 | * Returns 2224 | * 0 on success, or a negative error in case of failure. 2225 | */ 2226 | static long (*bpf_map_pop_elem)(void *map, void *value) = (void *) 88; 2227 | 2228 | /* 2229 | * bpf_map_peek_elem 2230 | * 2231 | * Get an element from *map* without removing it. 2232 | * 2233 | * Returns 2234 | * 0 on success, or a negative error in case of failure. 2235 | */ 2236 | static long (*bpf_map_peek_elem)(void *map, void *value) = (void *) 89; 2237 | 2238 | /* 2239 | * bpf_msg_push_data 2240 | * 2241 | * For socket policies, insert *len* bytes into *msg* at offset 2242 | * *start*. 2243 | * 2244 | * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a 2245 | * *msg* it may want to insert metadata or options into the *msg*. 2246 | * This can later be read and used by any of the lower layer BPF 2247 | * hooks. 2248 | * 2249 | * This helper may fail if under memory pressure (a malloc 2250 | * fails) in these cases BPF programs will get an appropriate 2251 | * error and BPF programs will need to handle them. 2252 | * 2253 | * Returns 2254 | * 0 on success, or a negative error in case of failure. 2255 | */ 2256 | static long (*bpf_msg_push_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 90; 2257 | 2258 | /* 2259 | * bpf_msg_pop_data 2260 | * 2261 | * Will remove *len* bytes from a *msg* starting at byte *start*. 2262 | * This may result in **ENOMEM** errors under certain situations if 2263 | * an allocation and copy are required due to a full ring buffer. 2264 | * However, the helper will try to avoid doing the allocation 2265 | * if possible. 
Other errors can occur if input parameters are 2266 | * invalid either due to *start* byte not being valid part of *msg* 2267 | * payload and/or *pop* value being to large. 2268 | * 2269 | * Returns 2270 | * 0 on success, or a negative error in case of failure. 2271 | */ 2272 | static long (*bpf_msg_pop_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 91; 2273 | 2274 | /* 2275 | * bpf_rc_pointer_rel 2276 | * 2277 | * This helper is used in programs implementing IR decoding, to 2278 | * report a successfully decoded pointer movement. 2279 | * 2280 | * The *ctx* should point to the lirc sample as passed into 2281 | * the program. 2282 | * 2283 | * This helper is only available is the kernel was compiled with 2284 | * the **CONFIG_BPF_LIRC_MODE2** configuration option set to 2285 | * "**y**". 2286 | * 2287 | * Returns 2288 | * 0 2289 | */ 2290 | static long (*bpf_rc_pointer_rel)(void *ctx, __s32 rel_x, __s32 rel_y) = (void *) 92; 2291 | 2292 | /* 2293 | * bpf_spin_lock 2294 | * 2295 | * Acquire a spinlock represented by the pointer *lock*, which is 2296 | * stored as part of a value of a map. Taking the lock allows to 2297 | * safely update the rest of the fields in that value. The 2298 | * spinlock can (and must) later be released with a call to 2299 | * **bpf_spin_unlock**\ (\ *lock*\ ). 2300 | * 2301 | * Spinlocks in BPF programs come with a number of restrictions 2302 | * and constraints: 2303 | * 2304 | * * **bpf_spin_lock** objects are only allowed inside maps of 2305 | * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this 2306 | * list could be extended in the future). 2307 | * * BTF description of the map is mandatory. 2308 | * * The BPF program can take ONE lock at a time, since taking two 2309 | * or more could cause dead locks. 2310 | * * Only one **struct bpf_spin_lock** is allowed per map element. 2311 | * * When the lock is taken, calls (either BPF to BPF or helpers) 2312 | * are not allowed. 2313 | * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not 2314 | * allowed inside a spinlock-ed region. 2315 | * * The BPF program MUST call **bpf_spin_unlock**\ () to release 2316 | * the lock, on all execution paths, before it returns. 2317 | * * The BPF program can access **struct bpf_spin_lock** only via 2318 | * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () 2319 | * helpers. Loading or storing data into the **struct 2320 | * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. 2321 | * * To use the **bpf_spin_lock**\ () helper, the BTF description 2322 | * of the map value must be a struct and have **struct 2323 | * bpf_spin_lock** *anyname*\ **;** field at the top level. 2324 | * Nested lock inside another struct is not allowed. 2325 | * * The **struct bpf_spin_lock** *lock* field in a map value must 2326 | * be aligned on a multiple of 4 bytes in that value. 2327 | * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy 2328 | * the **bpf_spin_lock** field to user space. 2329 | * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from 2330 | * a BPF program, do not update the **bpf_spin_lock** field. 2331 | * * **bpf_spin_lock** cannot be on the stack or inside a 2332 | * networking packet (it can only be inside of a map values). 2333 | * * **bpf_spin_lock** is available to root only. 2334 | * * Tracing programs and socket filter programs cannot use 2335 | * **bpf_spin_lock**\ () due to insufficient preemption checks 2336 | * (but this may change in the future). 
2337 | * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. 2338 | * 2339 | * Returns 2340 | * 0 2341 | */ 2342 | static long (*bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) 93; 2343 | 2344 | /* 2345 | * bpf_spin_unlock 2346 | * 2347 | * Release the *lock* previously locked by a call to 2348 | * **bpf_spin_lock**\ (\ *lock*\ ). 2349 | * 2350 | * Returns 2351 | * 0 2352 | */ 2353 | static long (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) 94; 2354 | 2355 | /* 2356 | * bpf_sk_fullsock 2357 | * 2358 | * This helper gets a **struct bpf_sock** pointer such 2359 | * that all the fields in this **bpf_sock** can be accessed. 2360 | * 2361 | * Returns 2362 | * A **struct bpf_sock** pointer on success, or **NULL** in 2363 | * case of failure. 2364 | */ 2365 | static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) 95; 2366 | 2367 | /* 2368 | * bpf_tcp_sock 2369 | * 2370 | * This helper gets a **struct bpf_tcp_sock** pointer from a 2371 | * **struct bpf_sock** pointer. 2372 | * 2373 | * Returns 2374 | * A **struct bpf_tcp_sock** pointer on success, or **NULL** in 2375 | * case of failure. 2376 | */ 2377 | static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96; 2378 | 2379 | /* 2380 | * bpf_skb_ecn_set_ce 2381 | * 2382 | * Set ECN (Explicit Congestion Notification) field of IP header 2383 | * to **CE** (Congestion Encountered) if current value is **ECT** 2384 | * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 2385 | * and IPv4. 2386 | * 2387 | * Returns 2388 | * 1 if the **CE** flag is set (either by the current helper call 2389 | * or because it was already present), 0 if it is not set. 2390 | */ 2391 | static long (*bpf_skb_ecn_set_ce)(struct __sk_buff *skb) = (void *) 97; 2392 | 2393 | /* 2394 | * bpf_get_listener_sock 2395 | * 2396 | * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. 2397 | * **bpf_sk_release**\ () is unnecessary and not allowed. 2398 | * 2399 | * Returns 2400 | * A **struct bpf_sock** pointer on success, or **NULL** in 2401 | * case of failure. 2402 | */ 2403 | static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) 98; 2404 | 2405 | /* 2406 | * bpf_skc_lookup_tcp 2407 | * 2408 | * Look for TCP socket matching *tuple*, optionally in a child 2409 | * network namespace *netns*. The return value must be checked, 2410 | * and if non-**NULL**, released via **bpf_sk_release**\ (). 2411 | * 2412 | * This function is identical to **bpf_sk_lookup_tcp**\ (), except 2413 | * that it also returns timewait or request sockets. Use 2414 | * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the 2415 | * full structure. 2416 | * 2417 | * This helper is available only if the kernel was compiled with 2418 | * **CONFIG_NET** configuration option. 2419 | * 2420 | * Returns 2421 | * Pointer to **struct bpf_sock**, or **NULL** in case of failure. 2422 | * For sockets with reuseport option, the **struct bpf_sock** 2423 | * result is from *reuse*\ **->socks**\ [] using the hash of the 2424 | * tuple. 2425 | */ 2426 | static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 99; 2427 | 2428 | /* 2429 | * bpf_tcp_check_syncookie 2430 | * 2431 | * Check whether *iph* and *th* contain a valid SYN cookie ACK for 2432 | * the listening socket in *sk*. 
2433 | * 2434 | * *iph* points to the start of the IPv4 or IPv6 header, while 2435 | * *iph_len* contains **sizeof**\ (**struct iphdr**) or 2436 | * **sizeof**\ (**struct ip6hdr**). 2437 | * 2438 | * *th* points to the start of the TCP header, while *th_len* 2439 | * contains **sizeof**\ (**struct tcphdr**). 2440 | * 2441 | * Returns 2442 | * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative 2443 | * error otherwise. 2444 | */ 2445 | static long (*bpf_tcp_check_syncookie)(struct bpf_sock *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 100; 2446 | 2447 | /* 2448 | * bpf_sysctl_get_name 2449 | * 2450 | * Get name of sysctl in /proc/sys/ and copy it into provided by 2451 | * program buffer *buf* of size *buf_len*. 2452 | * 2453 | * The buffer is always NUL terminated, unless it's zero-sized. 2454 | * 2455 | * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is 2456 | * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name 2457 | * only (e.g. "tcp_mem"). 2458 | * 2459 | * Returns 2460 | * Number of character copied (not including the trailing NUL). 2461 | * 2462 | * **-E2BIG** if the buffer wasn't big enough (*buf* will contain 2463 | * truncated name in this case). 2464 | */ 2465 | static long (*bpf_sysctl_get_name)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len, __u64 flags) = (void *) 101; 2466 | 2467 | /* 2468 | * bpf_sysctl_get_current_value 2469 | * 2470 | * Get current value of sysctl as it is presented in /proc/sys 2471 | * (incl. newline, etc), and copy it as a string into provided 2472 | * by program buffer *buf* of size *buf_len*. 2473 | * 2474 | * The whole value is copied, no matter what file position user 2475 | * space issued e.g. sys_read at. 2476 | * 2477 | * The buffer is always NUL terminated, unless it's zero-sized. 2478 | * 2479 | * Returns 2480 | * Number of character copied (not including the trailing NUL). 2481 | * 2482 | * **-E2BIG** if the buffer wasn't big enough (*buf* will contain 2483 | * truncated name in this case). 2484 | * 2485 | * **-EINVAL** if current value was unavailable, e.g. because 2486 | * sysctl is uninitialized and read returns -EIO for it. 2487 | */ 2488 | static long (*bpf_sysctl_get_current_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 102; 2489 | 2490 | /* 2491 | * bpf_sysctl_get_new_value 2492 | * 2493 | * Get new value being written by user space to sysctl (before 2494 | * the actual write happens) and copy it as a string into 2495 | * provided by program buffer *buf* of size *buf_len*. 2496 | * 2497 | * User space may write new value at file position > 0. 2498 | * 2499 | * The buffer is always NUL terminated, unless it's zero-sized. 2500 | * 2501 | * Returns 2502 | * Number of character copied (not including the trailing NUL). 2503 | * 2504 | * **-E2BIG** if the buffer wasn't big enough (*buf* will contain 2505 | * truncated name in this case). 2506 | * 2507 | * **-EINVAL** if sysctl is being read. 2508 | */ 2509 | static long (*bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 103; 2510 | 2511 | /* 2512 | * bpf_sysctl_set_new_value 2513 | * 2514 | * Override new value being written by user space to sysctl with 2515 | * value provided by program in buffer *buf* of size *buf_len*. 2516 | * 2517 | * *buf* should contain a string in same form as provided by user 2518 | * space on sysctl write. 2519 | * 2520 | * User space may write new value at file position > 0. 
To override 2521 | * the whole sysctl value file position should be set to zero. 2522 | * 2523 | * Returns 2524 | * 0 on success. 2525 | * 2526 | * **-E2BIG** if the *buf_len* is too big. 2527 | * 2528 | * **-EINVAL** if sysctl is being read. 2529 | */ 2530 | static long (*bpf_sysctl_set_new_value)(struct bpf_sysctl *ctx, const char *buf, unsigned long buf_len) = (void *) 104; 2531 | 2532 | /* 2533 | * bpf_strtol 2534 | * 2535 | * Convert the initial part of the string from buffer *buf* of 2536 | * size *buf_len* to a long integer according to the given base 2537 | * and save the result in *res*. 2538 | * 2539 | * The string may begin with an arbitrary amount of white space 2540 | * (as determined by **isspace**\ (3)) followed by a single 2541 | * optional '**-**' sign. 2542 | * 2543 | * Five least significant bits of *flags* encode base, other bits 2544 | * are currently unused. 2545 | * 2546 | * Base must be either 8, 10, 16 or 0 to detect it automatically 2547 | * similar to user space **strtol**\ (3). 2548 | * 2549 | * Returns 2550 | * Number of characters consumed on success. Must be positive but 2551 | * no more than *buf_len*. 2552 | * 2553 | * **-EINVAL** if no valid digits were found or unsupported base 2554 | * was provided. 2555 | * 2556 | * **-ERANGE** if resulting value was out of range. 2557 | */ 2558 | static long (*bpf_strtol)(const char *buf, unsigned long buf_len, __u64 flags, long *res) = (void *) 105; 2559 | 2560 | /* 2561 | * bpf_strtoul 2562 | * 2563 | * Convert the initial part of the string from buffer *buf* of 2564 | * size *buf_len* to an unsigned long integer according to the 2565 | * given base and save the result in *res*. 2566 | * 2567 | * The string may begin with an arbitrary amount of white space 2568 | * (as determined by **isspace**\ (3)). 2569 | * 2570 | * Five least significant bits of *flags* encode base, other bits 2571 | * are currently unused. 2572 | * 2573 | * Base must be either 8, 10, 16 or 0 to detect it automatically 2574 | * similar to user space **strtoul**\ (3). 2575 | * 2576 | * Returns 2577 | * Number of characters consumed on success. Must be positive but 2578 | * no more than *buf_len*. 2579 | * 2580 | * **-EINVAL** if no valid digits were found or unsupported base 2581 | * was provided. 2582 | * 2583 | * **-ERANGE** if resulting value was out of range. 2584 | */ 2585 | static long (*bpf_strtoul)(const char *buf, unsigned long buf_len, __u64 flags, unsigned long *res) = (void *) 106; 2586 | 2587 | /* 2588 | * bpf_sk_storage_get 2589 | * 2590 | * Get a bpf-local-storage from a *sk*. 2591 | * 2592 | * Logically, it could be thought of getting the value from 2593 | * a *map* with *sk* as the **key**. From this 2594 | * perspective, the usage is not much different from 2595 | * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this 2596 | * helper enforces the key must be a full socket and the map must 2597 | * be a **BPF_MAP_TYPE_SK_STORAGE** also. 2598 | * 2599 | * Underneath, the value is stored locally at *sk* instead of 2600 | * the *map*. The *map* is used as the bpf-local-storage 2601 | * "type". The bpf-local-storage "type" (i.e. the *map*) is 2602 | * searched against all bpf-local-storages residing at *sk*. 2603 | * 2604 | * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be 2605 | * used such that a new bpf-local-storage will be 2606 | * created if one does not exist. *value* can be used 2607 | * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify 2608 | * the initial value of a bpf-local-storage. 
If *value* is 2609 | * **NULL**, the new bpf-local-storage will be zero initialized. 2610 | * 2611 | * Returns 2612 | * A bpf-local-storage pointer is returned on success. 2613 | * 2614 | * **NULL** if not found or there was an error in adding 2615 | * a new bpf-local-storage. 2616 | */ 2617 | static void *(*bpf_sk_storage_get)(void *map, struct bpf_sock *sk, void *value, __u64 flags) = (void *) 107; 2618 | 2619 | /* 2620 | * bpf_sk_storage_delete 2621 | * 2622 | * Delete a bpf-local-storage from a *sk*. 2623 | * 2624 | * Returns 2625 | * 0 on success. 2626 | * 2627 | * **-ENOENT** if the bpf-local-storage cannot be found. 2628 | */ 2629 | static long (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) = (void *) 108; 2630 | 2631 | /* 2632 | * bpf_send_signal 2633 | * 2634 | * Send signal *sig* to the process of the current task. 2635 | * The signal may be delivered to any of this process's threads. 2636 | * 2637 | * Returns 2638 | * 0 on success or successfully queued. 2639 | * 2640 | * **-EBUSY** if work queue under nmi is full. 2641 | * 2642 | * **-EINVAL** if *sig* is invalid. 2643 | * 2644 | * **-EPERM** if no permission to send the *sig*. 2645 | * 2646 | * **-EAGAIN** if bpf program can try again. 2647 | */ 2648 | static long (*bpf_send_signal)(__u32 sig) = (void *) 109; 2649 | 2650 | /* 2651 | * bpf_tcp_gen_syncookie 2652 | * 2653 | * Try to issue a SYN cookie for the packet with corresponding 2654 | * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. 2655 | * 2656 | * *iph* points to the start of the IPv4 or IPv6 header, while 2657 | * *iph_len* contains **sizeof**\ (**struct iphdr**) or 2658 | * **sizeof**\ (**struct ip6hdr**). 2659 | * 2660 | * *th* points to the start of the TCP header, while *th_len* 2661 | * contains the length of the TCP header. 2662 | * 2663 | * Returns 2664 | * On success, lower 32 bits hold the generated SYN cookie in 2665 | * followed by 16 bits which hold the MSS value for that cookie, 2666 | * and the top 16 bits are unused. 2667 | * 2668 | * On failure, the returned value is one of the following: 2669 | * 2670 | * **-EINVAL** SYN cookie cannot be issued due to error 2671 | * 2672 | * **-ENOENT** SYN cookie should not be issued (no SYN flood) 2673 | * 2674 | * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies 2675 | * 2676 | * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 2677 | */ 2678 | static __s64 (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 110; 2679 | 2680 | /* 2681 | * bpf_skb_output 2682 | * 2683 | * Write raw *data* blob into a special BPF perf event held by 2684 | * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf 2685 | * event must have the following attributes: **PERF_SAMPLE_RAW** 2686 | * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and 2687 | * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. 2688 | * 2689 | * The *flags* are used to indicate the index in *map* for which 2690 | * the value must be put, masked with **BPF_F_INDEX_MASK**. 2691 | * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** 2692 | * to indicate that the index of the current CPU core should be 2693 | * used. 2694 | * 2695 | * The value to write, of *size*, is passed through eBPF stack and 2696 | * pointed by *data*. 2697 | * 2698 | * *ctx* is a pointer to in-kernel struct sk_buff. 2699 | * 2700 | * This helper is similar to **bpf_perf_event_output**\ () but 2701 | * restricted to raw_tracepoint bpf programs. 
2702 | * 2703 | * Returns 2704 | * 0 on success, or a negative error in case of failure. 2705 | */ 2706 | static long (*bpf_skb_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 111; 2707 | 2708 | /* 2709 | * bpf_probe_read_user 2710 | * 2711 | * Safely attempt to read *size* bytes from user space address 2712 | * *unsafe_ptr* and store the data in *dst*. 2713 | * 2714 | * Returns 2715 | * 0 on success, or a negative error in case of failure. 2716 | */ 2717 | static long (*bpf_probe_read_user)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 112; 2718 | 2719 | /* 2720 | * bpf_probe_read_kernel 2721 | * 2722 | * Safely attempt to read *size* bytes from kernel space address 2723 | * *unsafe_ptr* and store the data in *dst*. 2724 | * 2725 | * Returns 2726 | * 0 on success, or a negative error in case of failure. 2727 | */ 2728 | static long (*bpf_probe_read_kernel)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 113; 2729 | 2730 | /* 2731 | * bpf_probe_read_user_str 2732 | * 2733 | * Copy a NUL terminated string from an unsafe user address 2734 | * *unsafe_ptr* to *dst*. The *size* should include the 2735 | * terminating NUL byte. In case the string length is smaller than 2736 | * *size*, the target is not padded with further NUL bytes. If the 2737 | * string length is larger than *size*, just *size*-1 bytes are 2738 | * copied and the last byte is set to NUL. 2739 | * 2740 | * On success, the length of the copied string is returned. This 2741 | * makes this helper useful in tracing programs for reading 2742 | * strings, and more importantly to get its length at runtime. See 2743 | * the following snippet: 2744 | * 2745 | * :: 2746 | * 2747 | * SEC("kprobe/sys_open") 2748 | * void bpf_sys_open(struct pt_regs *ctx) 2749 | * { 2750 | * char buf[PATHLEN]; // PATHLEN is defined to 256 2751 | * int res = bpf_probe_read_user_str(buf, sizeof(buf), 2752 | * ctx->di); 2753 | * 2754 | * // Consume buf, for example push it to 2755 | * // userspace via bpf_perf_event_output(); we 2756 | * // can use res (the string length) as event 2757 | * // size, after checking its boundaries. 2758 | * } 2759 | * 2760 | * In comparison, using **bpf_probe_read_user**\ () helper here 2761 | * instead to read the string would require to estimate the length 2762 | * at compile time, and would often result in copying more memory 2763 | * than necessary. 2764 | * 2765 | * Another useful use case is when parsing individual process 2766 | * arguments or individual environment variables navigating 2767 | * *current*\ **->mm->arg_start** and *current*\ 2768 | * **->mm->env_start**: using this helper and the return value, 2769 | * one can quickly iterate at the right offset of the memory area. 2770 | * 2771 | * Returns 2772 | * On success, the strictly positive length of the string, 2773 | * including the trailing NUL character. On error, a negative 2774 | * value. 2775 | */ 2776 | static long (*bpf_probe_read_user_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 114; 2777 | 2778 | /* 2779 | * bpf_probe_read_kernel_str 2780 | * 2781 | * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* 2782 | * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. 2783 | * 2784 | * Returns 2785 | * On success, the strictly positive length of the string, including 2786 | * the trailing NUL character. On error, a negative value. 
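 *
 * Editor's note: an illustrative sketch, not part of the original
 * kernel documentation. It mirrors the **bpf_probe_read_user_str**\ ()
 * snippet above, but for a kernel address; *name* is assumed to be a
 * valid kernel pointer to a NUL terminated string obtained elsewhere
 * in the program:
 *
 * ::
 *
 *	char buf[64];
 *	long res = bpf_probe_read_kernel_str(buf, sizeof(buf), name);
 *
 *	if (res > 0) {
 *		// buf now holds at most sizeof(buf) - 1 characters
 *		// plus the trailing NUL; res includes that NUL.
 *	}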
2787 | */ 2788 | static long (*bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 115; 2789 | 2790 | /* 2791 | * bpf_tcp_send_ack 2792 | * 2793 | * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. 2794 | * *rcv_nxt* is the ack_seq to be sent out. 2795 | * 2796 | * Returns 2797 | * 0 on success, or a negative error in case of failure. 2798 | */ 2799 | static long (*bpf_tcp_send_ack)(void *tp, __u32 rcv_nxt) = (void *) 116; 2800 | 2801 | /* 2802 | * bpf_send_signal_thread 2803 | * 2804 | * Send signal *sig* to the thread corresponding to the current task. 2805 | * 2806 | * Returns 2807 | * 0 on success or successfully queued. 2808 | * 2809 | * **-EBUSY** if work queue under nmi is full. 2810 | * 2811 | * **-EINVAL** if *sig* is invalid. 2812 | * 2813 | * **-EPERM** if no permission to send the *sig*. 2814 | * 2815 | * **-EAGAIN** if bpf program can try again. 2816 | */ 2817 | static long (*bpf_send_signal_thread)(__u32 sig) = (void *) 117; 2818 | 2819 | /* 2820 | * bpf_jiffies64 2821 | * 2822 | * Obtain the 64bit jiffies 2823 | * 2824 | * Returns 2825 | * The 64 bit jiffies 2826 | */ 2827 | static __u64 (*bpf_jiffies64)(void) = (void *) 118; 2828 | 2829 | /* 2830 | * bpf_read_branch_records 2831 | * 2832 | * For an eBPF program attached to a perf event, retrieve the 2833 | * branch records (**struct perf_branch_entry**) associated to *ctx* 2834 | * and store it in the buffer pointed by *buf* up to size 2835 | * *size* bytes. 2836 | * 2837 | * Returns 2838 | * On success, number of bytes written to *buf*. On error, a 2839 | * negative value. 2840 | * 2841 | * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to 2842 | * instead return the number of bytes required to store all the 2843 | * branch entries. If this flag is set, *buf* may be NULL. 2844 | * 2845 | * **-EINVAL** if arguments invalid or **size** not a multiple 2846 | * of **sizeof**\ (**struct perf_branch_entry**\ ). 2847 | * 2848 | * **-ENOENT** if architecture does not support branch records. 2849 | */ 2850 | static long (*bpf_read_branch_records)(struct bpf_perf_event_data *ctx, void *buf, __u32 size, __u64 flags) = (void *) 119; 2851 | 2852 | /* 2853 | * bpf_get_ns_current_pid_tgid 2854 | * 2855 | * Returns 0 on success, values for *pid* and *tgid* as seen from the current 2856 | * *namespace* will be returned in *nsdata*. 2857 | * 2858 | * Returns 2859 | * 0 on success, or one of the following in case of failure: 2860 | * 2861 | * **-EINVAL** if dev and inum supplied don't match dev_t and inode number 2862 | * with nsfs of current task, or if dev conversion to dev_t lost high bits. 2863 | * 2864 | * **-ENOENT** if pidns does not exists for the current task. 2865 | */ 2866 | static long (*bpf_get_ns_current_pid_tgid)(__u64 dev, __u64 ino, struct bpf_pidns_info *nsdata, __u32 size) = (void *) 120; 2867 | 2868 | /* 2869 | * bpf_xdp_output 2870 | * 2871 | * Write raw *data* blob into a special BPF perf event held by 2872 | * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf 2873 | * event must have the following attributes: **PERF_SAMPLE_RAW** 2874 | * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and 2875 | * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. 2876 | * 2877 | * The *flags* are used to indicate the index in *map* for which 2878 | * the value must be put, masked with **BPF_F_INDEX_MASK**. 2879 | * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** 2880 | * to indicate that the index of the current CPU core should be 2881 | * used. 
2882 | * 2883 | * The value to write, of *size*, is passed through eBPF stack and 2884 | * pointed by *data*. 2885 | * 2886 | * *ctx* is a pointer to in-kernel struct xdp_buff. 2887 | * 2888 | * This helper is similar to **bpf_perf_eventoutput**\ () but 2889 | * restricted to raw_tracepoint bpf programs. 2890 | * 2891 | * Returns 2892 | * 0 on success, or a negative error in case of failure. 2893 | */ 2894 | static long (*bpf_xdp_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 121; 2895 | 2896 | /* 2897 | * bpf_get_netns_cookie 2898 | * 2899 | * Retrieve the cookie (generated by the kernel) of the network 2900 | * namespace the input *ctx* is associated with. The network 2901 | * namespace cookie remains stable for its lifetime and provides 2902 | * a global identifier that can be assumed unique. If *ctx* is 2903 | * NULL, then the helper returns the cookie for the initial 2904 | * network namespace. The cookie itself is very similar to that 2905 | * of **bpf_get_socket_cookie**\ () helper, but for network 2906 | * namespaces instead of sockets. 2907 | * 2908 | * Returns 2909 | * A 8-byte long opaque number. 2910 | */ 2911 | static __u64 (*bpf_get_netns_cookie)(void *ctx) = (void *) 122; 2912 | 2913 | /* 2914 | * bpf_get_current_ancestor_cgroup_id 2915 | * 2916 | * Return id of cgroup v2 that is ancestor of the cgroup associated 2917 | * with the current task at the *ancestor_level*. The root cgroup 2918 | * is at *ancestor_level* zero and each step down the hierarchy 2919 | * increments the level. If *ancestor_level* == level of cgroup 2920 | * associated with the current task, then return value will be the 2921 | * same as that of **bpf_get_current_cgroup_id**\ (). 2922 | * 2923 | * The helper is useful to implement policies based on cgroups 2924 | * that are upper in hierarchy than immediate cgroup associated 2925 | * with the current task. 2926 | * 2927 | * The format of returned id and helper limitations are same as in 2928 | * **bpf_get_current_cgroup_id**\ (). 2929 | * 2930 | * Returns 2931 | * The id is returned or 0 in case the id could not be retrieved. 2932 | */ 2933 | static __u64 (*bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void *) 123; 2934 | 2935 | /* 2936 | * bpf_sk_assign 2937 | * 2938 | * Helper is overloaded depending on BPF program type. This 2939 | * description applies to **BPF_PROG_TYPE_SCHED_CLS** and 2940 | * **BPF_PROG_TYPE_SCHED_ACT** programs. 2941 | * 2942 | * Assign the *sk* to the *skb*. When combined with appropriate 2943 | * routing configuration to receive the packet towards the socket, 2944 | * will cause *skb* to be delivered to the specified socket. 2945 | * Subsequent redirection of *skb* via **bpf_redirect**\ (), 2946 | * **bpf_clone_redirect**\ () or other methods outside of BPF may 2947 | * interfere with successful delivery to the socket. 2948 | * 2949 | * This operation is only valid from TC ingress path. 2950 | * 2951 | * The *flags* argument must be zero. 2952 | * 2953 | * Returns 2954 | * 0 on success, or a negative error in case of failure: 2955 | * 2956 | * **-EINVAL** if specified *flags* are not supported. 2957 | * 2958 | * **-ENOENT** if the socket is unavailable for assignment. 2959 | * 2960 | * **-ENETUNREACH** if the socket is unreachable (wrong netns). 2961 | * 2962 | * **-EOPNOTSUPP** if the operation is not supported, for example 2963 | * a call from outside of TC ingress. 2964 | * 2965 | * **-ESOCKTNOSUPPORT** if the socket type is not supported 2966 | * (reuseport). 
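 *
 * Editor's note: an illustrative sketch, not part of the original
 * kernel documentation. It shows the usual lookup/assign/release
 * pattern on the TC ingress path; *tuple* is assumed to have been
 * filled from the packet headers beforehand:
 *
 * ::
 *
 *	struct bpf_sock *sk;
 *	long err = -1;
 *
 *	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
 *			       BPF_F_CURRENT_NETNS, 0);
 *	if (sk) {
 *		err = bpf_sk_assign(skb, sk, 0);
 *		bpf_sk_release(sk);
 *	}
 *	// err != 0 means steering failed; drop or pass per policy.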
2967 | */ 2968 | static long (*bpf_sk_assign)(void *ctx, struct bpf_sock *sk, __u64 flags) = (void *) 124; 2969 | 2970 | /* 2971 | * bpf_ktime_get_boot_ns 2972 | * 2973 | * Return the time elapsed since system boot, in nanoseconds. 2974 | * Does include the time the system was suspended. 2975 | * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) 2976 | * 2977 | * Returns 2978 | * Current *ktime*. 2979 | */ 2980 | static __u64 (*bpf_ktime_get_boot_ns)(void) = (void *) 125; 2981 | 2982 | /* 2983 | * bpf_seq_printf 2984 | * 2985 | * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print 2986 | * out the format string. 2987 | * The *m* represents the seq_file. The *fmt* and *fmt_size* are for 2988 | * the format string itself. The *data* and *data_len* are format string 2989 | * arguments. The *data* are a **u64** array and corresponding format string 2990 | * values are stored in the array. For strings and pointers where pointees 2991 | * are accessed, only the pointer values are stored in the *data* array. 2992 | * The *data_len* is the size of *data* in bytes. 2993 | * 2994 | * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. 2995 | * Reading kernel memory may fail due to either invalid address or 2996 | * valid address but requiring a major memory fault. If reading kernel memory 2997 | * fails, the string for **%s** will be an empty string, and the ip 2998 | * address for **%p{i,I}{4,6}** will be 0. Not returning error to 2999 | * bpf program is consistent with what **bpf_trace_printk**\ () does for now. 3000 | * 3001 | * Returns 3002 | * 0 on success, or a negative error in case of failure: 3003 | * 3004 | * **-EBUSY** if per-CPU memory copy buffer is busy, can try again 3005 | * by returning 1 from bpf program. 3006 | * 3007 | * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. 3008 | * 3009 | * **-E2BIG** if *fmt* contains too many format specifiers. 3010 | * 3011 | * **-EOVERFLOW** if an overflow happened: The same object will be tried again. 3012 | */ 3013 | static long (*bpf_seq_printf)(struct seq_file *m, const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 126; 3014 | 3015 | /* 3016 | * bpf_seq_write 3017 | * 3018 | * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. 3019 | * The *m* represents the seq_file. The *data* and *len* represent the 3020 | * data to write in bytes. 3021 | * 3022 | * Returns 3023 | * 0 on success, or a negative error in case of failure: 3024 | * 3025 | * **-EOVERFLOW** if an overflow happened: The same object will be tried again. 3026 | */ 3027 | static long (*bpf_seq_write)(struct seq_file *m, const void *data, __u32 len) = (void *) 127; 3028 | 3029 | /* 3030 | * bpf_sk_cgroup_id 3031 | * 3032 | * Return the cgroup v2 id of the socket *sk*. 3033 | * 3034 | * *sk* must be a non-**NULL** pointer to a full socket, e.g. one 3035 | * returned from **bpf_sk_lookup_xxx**\ (), 3036 | * **bpf_sk_fullsock**\ (), etc. The format of returned id is 3037 | * same as in **bpf_skb_cgroup_id**\ (). 3038 | * 3039 | * This helper is available only if the kernel was compiled with 3040 | * the **CONFIG_SOCK_CGROUP_DATA** configuration option. 3041 | * 3042 | * Returns 3043 | * The id is returned or 0 in case the id could not be retrieved. 
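 *
 * Editor's note: an illustrative sketch, not part of the original
 * kernel documentation. *target_cgroup_id* is assumed to have been
 * supplied by user space (for example via a map), and *sk* to have
 * come from a prior **bpf_sk_lookup_xxx**\ () or
 * **bpf_sk_fullsock**\ () call:
 *
 * ::
 *
 *	__u64 id = bpf_sk_cgroup_id(sk);
 *
 *	if (id && id == target_cgroup_id) {
 *		// the socket belongs to the cgroup of interest
 *	}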
3044 | */ 3045 | static __u64 (*bpf_sk_cgroup_id)(struct bpf_sock *sk) = (void *) 128; 3046 | 3047 | /* 3048 | * bpf_sk_ancestor_cgroup_id 3049 | * 3050 | * Return id of cgroup v2 that is ancestor of cgroup associated 3051 | * with the *sk* at the *ancestor_level*. The root cgroup is at 3052 | * *ancestor_level* zero and each step down the hierarchy 3053 | * increments the level. If *ancestor_level* == level of cgroup 3054 | * associated with *sk*, then return value will be same as that 3055 | * of **bpf_sk_cgroup_id**\ (). 3056 | * 3057 | * The helper is useful to implement policies based on cgroups 3058 | * that are upper in hierarchy than immediate cgroup associated 3059 | * with *sk*. 3060 | * 3061 | * The format of returned id and helper limitations are same as in 3062 | * **bpf_sk_cgroup_id**\ (). 3063 | * 3064 | * Returns 3065 | * The id is returned or 0 in case the id could not be retrieved. 3066 | */ 3067 | static __u64 (*bpf_sk_ancestor_cgroup_id)(struct bpf_sock *sk, int ancestor_level) = (void *) 129; 3068 | 3069 | /* 3070 | * bpf_ringbuf_output 3071 | * 3072 | * Copy *size* bytes from *data* into a ring buffer *ringbuf*. 3073 | * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification 3074 | * of new data availability is sent. 3075 | * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification 3076 | * of new data availability is sent unconditionally. 3077 | * 3078 | * Returns 3079 | * 0 on success, or a negative error in case of failure. 3080 | */ 3081 | static long (*bpf_ringbuf_output)(void *ringbuf, void *data, __u64 size, __u64 flags) = (void *) 130; 3082 | 3083 | /* 3084 | * bpf_ringbuf_reserve 3085 | * 3086 | * Reserve *size* bytes of payload in a ring buffer *ringbuf*. 3087 | * 3088 | * Returns 3089 | * Valid pointer with *size* bytes of memory available; NULL, 3090 | * otherwise. 3091 | */ 3092 | static void *(*bpf_ringbuf_reserve)(void *ringbuf, __u64 size, __u64 flags) = (void *) 131; 3093 | 3094 | /* 3095 | * bpf_ringbuf_submit 3096 | * 3097 | * Submit reserved ring buffer sample, pointed to by *data*. 3098 | * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification 3099 | * of new data availability is sent. 3100 | * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification 3101 | * of new data availability is sent unconditionally. 3102 | * 3103 | * Returns 3104 | * Nothing. Always succeeds. 3105 | */ 3106 | static void (*bpf_ringbuf_submit)(void *data, __u64 flags) = (void *) 132; 3107 | 3108 | /* 3109 | * bpf_ringbuf_discard 3110 | * 3111 | * Discard reserved ring buffer sample, pointed to by *data*. 3112 | * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification 3113 | * of new data availability is sent. 3114 | * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification 3115 | * of new data availability is sent unconditionally. 3116 | * 3117 | * Returns 3118 | * Nothing. Always succeeds. 3119 | */ 3120 | static void (*bpf_ringbuf_discard)(void *data, __u64 flags) = (void *) 133; 3121 | 3122 | /* 3123 | * bpf_ringbuf_query 3124 | * 3125 | * Query various characteristics of provided ring buffer. What 3126 | * exactly is queries is determined by *flags*: 3127 | * 3128 | * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. 3129 | * * **BPF_RB_RING_SIZE**: The size of ring buffer. 3130 | * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). 3131 | * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). 
3132 | * 3133 | * Data returned is just a momentary snapshot of actual values 3134 | * and could be inaccurate, so this facility should be used to 3135 | * power heuristics and for reporting, not to make 100% correct 3136 | * calculation. 3137 | * 3138 | * Returns 3139 | * Requested value, or 0, if *flags* are not recognized. 3140 | */ 3141 | static __u64 (*bpf_ringbuf_query)(void *ringbuf, __u64 flags) = (void *) 134; 3142 | 3143 | /* 3144 | * bpf_csum_level 3145 | * 3146 | * Change the skbs checksum level by one layer up or down, or 3147 | * reset it entirely to none in order to have the stack perform 3148 | * checksum validation. The level is applicable to the following 3149 | * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of 3150 | * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | 3151 | * through **bpf_skb_adjust_room**\ () helper with passing in 3152 | * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call 3153 | * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since 3154 | * the UDP header is removed. Similarly, an encap of the latter 3155 | * into the former could be accompanied by a helper call to 3156 | * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the 3157 | * skb is still intended to be processed in higher layers of the 3158 | * stack instead of just egressing at tc. 3159 | * 3160 | * There are three supported level settings at this time: 3161 | * 3162 | * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs 3163 | * with CHECKSUM_UNNECESSARY. 3164 | * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs 3165 | * with CHECKSUM_UNNECESSARY. 3166 | * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and 3167 | * sets CHECKSUM_NONE to force checksum validation by the stack. 3168 | * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current 3169 | * skb->csum_level. 3170 | * 3171 | * Returns 3172 | * 0 on success, or a negative error in case of failure. In the 3173 | * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level 3174 | * is returned or the error code -EACCES in case the skb is not 3175 | * subject to CHECKSUM_UNNECESSARY. 3176 | */ 3177 | static long (*bpf_csum_level)(struct __sk_buff *skb, __u64 level) = (void *) 135; 3178 | 3179 | /* 3180 | * bpf_skc_to_tcp6_sock 3181 | * 3182 | * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. 3183 | * 3184 | * Returns 3185 | * *sk* if casting is valid, or NULL otherwise. 3186 | */ 3187 | static struct tcp6_sock *(*bpf_skc_to_tcp6_sock)(void *sk) = (void *) 136; 3188 | 3189 | /* 3190 | * bpf_skc_to_tcp_sock 3191 | * 3192 | * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. 3193 | * 3194 | * Returns 3195 | * *sk* if casting is valid, or NULL otherwise. 3196 | */ 3197 | static struct tcp_sock *(*bpf_skc_to_tcp_sock)(void *sk) = (void *) 137; 3198 | 3199 | /* 3200 | * bpf_skc_to_tcp_timewait_sock 3201 | * 3202 | * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. 3203 | * 3204 | * Returns 3205 | * *sk* if casting is valid, or NULL otherwise. 3206 | */ 3207 | static struct tcp_timewait_sock *(*bpf_skc_to_tcp_timewait_sock)(void *sk) = (void *) 138; 3208 | 3209 | /* 3210 | * bpf_skc_to_tcp_request_sock 3211 | * 3212 | * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. 3213 | * 3214 | * Returns 3215 | * *sk* if casting is valid, or NULL otherwise. 
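 *
 * Editor's note: an illustrative sketch, not part of the original
 * kernel documentation. The **bpf_skc_to_xxx**\ () helpers all follow
 * the same check-for-NULL pattern; dereferencing the returned pointer
 * additionally requires BTF/CO-RE type information, which is outside
 * the scope of this header:
 *
 * ::
 *
 *	struct tcp_request_sock *treq = bpf_skc_to_tcp_request_sock(sk);
 *
 *	if (!treq)
 *		return 0;	// not a request sock, nothing to do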
3216 | */ 3217 | static struct tcp_request_sock *(*bpf_skc_to_tcp_request_sock)(void *sk) = (void *) 139; 3218 | 3219 | /* 3220 | * bpf_skc_to_udp6_sock 3221 | * 3222 | * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. 3223 | * 3224 | * Returns 3225 | * *sk* if casting is valid, or NULL otherwise. 3226 | */ 3227 | static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140; 3228 | 3229 | /* 3230 | * bpf_get_task_stack 3231 | * 3232 | * Return a user or a kernel stack in bpf program provided buffer. 3233 | * To achieve this, the helper needs *task*, which is a valid 3234 | * pointer to struct task_struct. To store the stacktrace, the 3235 | * bpf program provides *buf* with a nonnegative *size*. 3236 | * 3237 | * The last argument, *flags*, holds the number of stack frames to 3238 | * skip (from 0 to 255), masked with 3239 | * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set 3240 | * the following flags: 3241 | * 3242 | * **BPF_F_USER_STACK** 3243 | * Collect a user space stack instead of a kernel stack. 3244 | * **BPF_F_USER_BUILD_ID** 3245 | * Collect buildid+offset instead of ips for user stack, 3246 | * only valid if **BPF_F_USER_STACK** is also specified. 3247 | * 3248 | * **bpf_get_task_stack**\ () can collect up to 3249 | * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject 3250 | * to sufficient large buffer size. Note that 3251 | * this limit can be controlled with the **sysctl** program, and 3252 | * that it should be manually increased in order to profile long 3253 | * user stacks (such as stacks for Java programs). To do so, use: 3254 | * 3255 | * :: 3256 | * 3257 | * # sysctl kernel.perf_event_max_stack= 3258 | * 3259 | * Returns 3260 | * A non-negative value equal to or less than *size* on success, 3261 | * or a negative error in case of failure. 3262 | */ 3263 | static long (*bpf_get_task_stack)(struct task_struct *task, void *buf, __u32 size, __u64 flags) = (void *) 141; 3264 | 3265 | 3266 | -------------------------------------------------------------------------------- /include/bpf/bpf_helpers.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 | #ifndef __BPF_HELPERS__ 3 | #define __BPF_HELPERS__ 4 | 5 | /* 6 | * Note that bpf programs need to include either 7 | * vmlinux.h (auto-generated from BTF) or linux/types.h 8 | * in advance since bpf_helper_defs.h uses such types 9 | * as __u64. 10 | */ 11 | #include "bpf_helper_defs.h" 12 | 13 | #define __uint(name, val) int (*name)[val] 14 | #define __type(name, val) typeof(val) *name 15 | #define __array(name, val) typeof(val) *name[] 16 | 17 | /* Helper macro to print out debug messages */ 18 | #define bpf_printk(fmt, ...) \ 19 | ({ \ 20 | char ____fmt[] = fmt; \ 21 | bpf_trace_printk(____fmt, sizeof(____fmt), \ 22 | ##__VA_ARGS__); \ 23 | }) 24 | 25 | /* 26 | * Helper macro to place programs, maps, license in 27 | * different sections in elf_bpf file. 
Section names 28 | * are interpreted by elf_bpf loader 29 | */ 30 | #define SEC(NAME) __attribute__((section(NAME), used)) 31 | 32 | #ifndef __always_inline 33 | #define __always_inline __attribute__((always_inline)) 34 | #endif 35 | #ifndef __weak 36 | #define __weak __attribute__((weak)) 37 | #endif 38 | 39 | /* 40 | * Helper macro to manipulate data structures 41 | */ 42 | #ifndef offsetof 43 | #define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) 44 | #endif 45 | #ifndef container_of 46 | #define container_of(ptr, type, member) \ 47 | ({ \ 48 | void *__mptr = (void *)(ptr); \ 49 | ((type *)(__mptr - offsetof(type, member))); \ 50 | }) 51 | #endif 52 | 53 | /* 54 | * Helper structure used by eBPF C program 55 | * to describe BPF map attributes to libbpf loader 56 | */ 57 | struct bpf_map_def { 58 | unsigned int type; 59 | unsigned int key_size; 60 | unsigned int value_size; 61 | unsigned int max_entries; 62 | unsigned int map_flags; 63 | }; 64 | 65 | enum libbpf_pin_type { 66 | LIBBPF_PIN_NONE, 67 | /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ 68 | LIBBPF_PIN_BY_NAME, 69 | }; 70 | 71 | enum libbpf_tristate { 72 | TRI_NO = 0, 73 | TRI_YES = 1, 74 | TRI_MODULE = 2, 75 | }; 76 | 77 | #define __kconfig __attribute__((section(".kconfig"))) 78 | #define __ksym __attribute__((section(".ksyms"))) 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /include/fasthash.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com) 4 | 5 | Permission is hereby granted, free of charge, to any person 6 | obtaining a copy of this software and associated documentation 7 | files (the "Software"), to deal in the Software without 8 | restriction, including without limitation the rights to use, copy, 9 | modify, merge, publish, distribute, sublicense, and/or sell copies 10 | of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | #pragma once 27 | 28 | #include 29 | #include 30 | 31 | // clang-format off 32 | 33 | // Compression function for Merkle-Damgard construction. 34 | // This function is generated using the framework provided. 
35 | static __attribute__((always_inline)) inline __u64 fasthash_mix(__u64 h) { 36 | h ^= h >> 23; 37 | h *= 0x2127599bf4325c37ULL; 38 | h ^= h >> 47; 39 | return h; 40 | } 41 | 42 | static __attribute__((always_inline)) inline __u64 fasthash64(const void *buf, __u64 len, __u64 seed) 43 | { 44 | const __u64 m = 0x880355f21e6d1965ULL; 45 | const __u64 *pos = (const __u64 *)buf; 46 | const __u64 *end = pos + (len / 8); 47 | __u64 h = seed ^ (len * m); 48 | __u64 v; 49 | 50 | #pragma clang loop unroll(full) 51 | while (pos != end) { 52 | v = *pos++; 53 | h ^= fasthash_mix(v); 54 | h *= m; 55 | } 56 | 57 | if (len & 7) { 58 | v = 0; 59 | __builtin_memcpy(&v, pos, len & 7); 60 | h ^= fasthash_mix(v); 61 | h *= m; 62 | } 63 | 64 | return fasthash_mix(h); 65 | } 66 | 67 | static __attribute__((always_inline)) inline __u32 fasthash32(const void *buf, __u64 len, __u32 seed) 68 | { 69 | // the following trick converts the 64-bit hashcode to Fermat 70 | // residue, which shall retain information from both the higher 71 | // and lower parts of hashcode. 72 | __u64 h = fasthash64(buf, len, seed); 73 | return h - (h >> 32); 74 | } -------------------------------------------------------------------------------- /include/in.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | // musl license: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT 6 | 7 | // from musl: include/netinet/in.h 8 | struct in_addr { 9 | __u32 s_addr; 10 | }; 11 | 12 | struct in6_addr { 13 | union { 14 | __u8 s6_addr[16]; 15 | __u16 s6_addr16[8]; 16 | __u32 s6_addr32[4]; 17 | }; 18 | }; -------------------------------------------------------------------------------- /include/ip.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | // musl license: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT 7 | 8 | // from musl: include/netinet/ip.h 9 | struct iphdr { 10 | __u8 version_ihl; 11 | __u8 tos; 12 | __u16 tot_len; 13 | __u16 id; 14 | __u16 frag_off; 15 | __u8 ttl; 16 | __u8 protocol; 17 | __u16 check; 18 | __u32 saddr; 19 | __u32 daddr; 20 | }; 21 | 22 | // from musl: include/netinet/ip6.h 23 | struct ip6_hdr { 24 | union { 25 | struct ip6_hdrctl { 26 | __u32 ip6_un1_flow; 27 | __u16 ip6_un1_plen; 28 | __u8 ip6_un1_nxt; 29 | __u8 ip6_un1_hlim; 30 | } ip6_un1; 31 | __u8 ip6_un2_vfc; 32 | } ip6_ctlun; 33 | struct in6_addr ip6_src; 34 | struct in6_addr ip6_dst; 35 | }; -------------------------------------------------------------------------------- /include/linux/bpf_common.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 2 | #ifndef __LINUX_BPF_COMMON_H__ 3 | #define __LINUX_BPF_COMMON_H__ 4 | 5 | /* Instruction classes */ 6 | #define BPF_CLASS(code) ((code) & 0x07) 7 | #define BPF_LD 0x00 8 | #define BPF_LDX 0x01 9 | #define BPF_ST 0x02 10 | #define BPF_STX 0x03 11 | #define BPF_ALU 0x04 12 | #define BPF_JMP 0x05 13 | #define BPF_RET 0x06 14 | #define BPF_MISC 0x07 15 | 16 | /* ld/ldx fields */ 17 | #define BPF_SIZE(code) ((code) & 0x18) 18 | #define BPF_W 0x00 /* 32-bit */ 19 | #define BPF_H 0x08 /* 16-bit */ 20 | #define BPF_B 0x10 /* 8-bit */ 21 | /* eBPF BPF_DW 0x18 64-bit */ 22 | #define BPF_MODE(code) ((code) & 0xe0) 23 | #define BPF_IMM 0x00 24 | #define BPF_ABS 0x20 25 | #define BPF_IND 0x40 26 | #define BPF_MEM 0x60 27 | #define BPF_LEN 0x80 28 | #define BPF_MSH 0xa0 29 
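/*
 * Editor's note: an illustrative example, not part of the original
 * header. A classic BPF "load half-word at absolute offset" opcode is
 * composed from the class, size and mode bits defined above:
 *
 *	BPF_LD | BPF_H | BPF_ABS  ==  0x00 | 0x08 | 0x20  ==  0x28
 */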
| 30 | /* alu/jmp fields */ 31 | #define BPF_OP(code) ((code) & 0xf0) 32 | #define BPF_ADD 0x00 33 | #define BPF_SUB 0x10 34 | #define BPF_MUL 0x20 35 | #define BPF_DIV 0x30 36 | #define BPF_OR 0x40 37 | #define BPF_AND 0x50 38 | #define BPF_LSH 0x60 39 | #define BPF_RSH 0x70 40 | #define BPF_NEG 0x80 41 | #define BPF_MOD 0x90 42 | #define BPF_XOR 0xa0 43 | 44 | #define BPF_JA 0x00 45 | #define BPF_JEQ 0x10 46 | #define BPF_JGT 0x20 47 | #define BPF_JGE 0x30 48 | #define BPF_JSET 0x40 49 | #define BPF_SRC(code) ((code) & 0x08) 50 | #define BPF_K 0x00 51 | #define BPF_X 0x08 52 | 53 | #ifndef BPF_MAXINSNS 54 | #define BPF_MAXINSNS 4096 55 | #endif 56 | 57 | #endif /* __LINUX_BPF_COMMON_H__ */ 58 | -------------------------------------------------------------------------------- /include/linux/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | typedef char __s8; 4 | typedef short __s16; 5 | typedef int __s32; 6 | typedef long long __s64; 7 | _Static_assert(sizeof(__s32) == 4, "__s32 must be 4 bytes"); 8 | _Static_assert(sizeof(__s64) == 8, "__s64 must be 8 bytes"); 9 | 10 | typedef unsigned char __u8; 11 | typedef unsigned short __u16; 12 | typedef unsigned int __u32; 13 | typedef unsigned long long __u64; 14 | _Static_assert(sizeof(__u32) == 4, "__u32 must be 4 bytes"); 15 | _Static_assert(sizeof(__u64) == 8, "__u64 must be 8 bytes"); 16 | 17 | typedef __u16 __be16; 18 | typedef __u16 __le16; 19 | typedef __u32 __be32; 20 | typedef __u32 __le32; 21 | typedef __u64 __be64; 22 | typedef __u64 __le64; 23 | 24 | typedef __u32 __wsum; 25 | 26 | typedef __u64 __attribute__((aligned(8))) __aligned_u64; 27 | -------------------------------------------------------------------------------- /include/lookup3.h: -------------------------------------------------------------------------------- 1 | /* 2 | ------------------------------------------------------------------------------- 3 | This file is derived from lookup3 by Bob Jenkins. The main change is that 4 | hashlittle assumes an aligned pointer. This is because BPF doesn't allow 5 | inspecting pointer values. 6 | 7 | lookup3.c, by Bob Jenkins, May 2006, Public Domain. 8 | 9 | These are functions for producing 32-bit hashes for hash table lookup. 10 | hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() 11 | are externally useful functions. Routines to test the hash are included 12 | if SELF_TEST is defined. You can use this free for any purpose. It's in 13 | the public domain. It has no warranty. 14 | 15 | You probably want to use hashlittle(). hashlittle() and hashbig() 16 | hash byte arrays. hashlittle() is is faster than hashbig() on 17 | little-endian machines. Intel and AMD are little-endian machines. 18 | On second thought, you probably want hashlittle2(), which is identical to 19 | hashlittle() except it returns two 32-bit hashes for the price of one. 20 | You could implement hashbig2() if you wanted but I haven't bothered here. 21 | 22 | If you want to find a hash of, say, exactly 7 integers, do 23 | a = i1; b = i2; c = i3; 24 | mix(a,b,c); 25 | a += i4; b += i5; c += i6; 26 | mix(a,b,c); 27 | a += i7; 28 | final(a,b,c); 29 | then use c as the hash value. If you have a variable length array of 30 | 4-byte integers to hash, use hashword(). If you have a byte array (like 31 | a character string), use hashlittle(). If you have several byte arrays, or 32 | a mix of things, see the comments above hashlittle(). 33 | 34 | Why is this so big? 
I read 12 bytes at a time into 3 4-byte integers, 35 | then mix those integers. This is fast (you can do a lot more thorough 36 | mixing with 12*3 instructions on 3 integers than you can with 3 instructions 37 | on 1 byte), but shoehorning those bytes into integers efficiently is messy. 38 | ------------------------------------------------------------------------------- 39 | */ 40 | 41 | #pragma once 42 | 43 | #include 44 | 45 | // clang-format off 46 | 47 | #define hashsize(n) ((__u32)1 << (n)) 48 | #define hashmask(n) (hashsize(n) - 1) 49 | #define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k)))) 50 | 51 | /* 52 | ------------------------------------------------------------------------------- 53 | mix -- mix 3 32-bit values reversibly. 54 | 55 | This is reversible, so any information in (a,b,c) before mix() is 56 | still in (a,b,c) after mix(). 57 | 58 | If four pairs of (a,b,c) inputs are run through mix(), or through 59 | mix() in reverse, there are at least 32 bits of the output that 60 | are sometimes the same for one pair and different for another pair. 61 | This was tested for: 62 | * pairs that differed by one bit, by two bits, in any combination 63 | of top bits of (a,b,c), or in any combination of bottom bits of 64 | (a,b,c). 65 | * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed 66 | the output delta to a Gray code (a^(a>>1)) so a string of 1's (as 67 | is commonly produced by subtraction) look like a single 1-bit 68 | difference. 69 | * the base values were pseudorandom, all zero but one bit set, or 70 | all zero plus a counter that starts at zero. 71 | 72 | Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that 73 | satisfy this are 74 | 4 6 8 16 19 4 75 | 9 15 3 18 27 15 76 | 14 9 3 7 17 3 77 | Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing 78 | for "differ" defined as + with a one-bit base and a two-bit delta. I 79 | used http://burtleburtle.net/bob/hash/avalanche.html to choose 80 | the operations, constants, and arrangements of the variables. 81 | 82 | This does not achieve avalanche. There are input bits of (a,b,c) 83 | that fail to affect some output bits of (a,b,c), especially of a. The 84 | most thoroughly mixed value is c, but it doesn't really even achieve 85 | avalanche in c. 86 | 87 | This allows some parallelism. Read-after-writes are good at doubling 88 | the number of bits affected, so the goal of mixing pulls in the opposite 89 | direction as the goal of parallelism. I did what I could. Rotates 90 | seem to cost as much as shifts on every machine I could lay my hands 91 | on, and rotates are much kinder to the top and bottom bits, so I used 92 | rotates. 93 | ------------------------------------------------------------------------------- 94 | */ 95 | #define mix(a, b, c) \ 96 | { \ 97 | a -= c; \ 98 | a ^= rot(c, 4); \ 99 | c += b; \ 100 | b -= a; \ 101 | b ^= rot(a, 6); \ 102 | a += c; \ 103 | c -= b; \ 104 | c ^= rot(b, 8); \ 105 | b += a; \ 106 | a -= c; \ 107 | a ^= rot(c, 16); \ 108 | c += b; \ 109 | b -= a; \ 110 | b ^= rot(a, 19); \ 111 | a += c; \ 112 | c -= b; \ 113 | c ^= rot(b, 4); \ 114 | b += a; \ 115 | } 116 | 117 | /* 118 | ------------------------------------------------------------------------------- 119 | final -- final mixing of 3 32-bit values (a,b,c) into c 120 | 121 | Pairs of (a,b,c) values differing in only a few bits will usually 122 | produce values of c that look totally different. 
This was tested for 123 | * pairs that differed by one bit, by two bits, in any combination 124 | of top bits of (a,b,c), or in any combination of bottom bits of 125 | (a,b,c). 126 | * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed 127 | the output delta to a Gray code (a^(a>>1)) so a string of 1's (as 128 | is commonly produced by subtraction) look like a single 1-bit 129 | difference. 130 | * the base values were pseudorandom, all zero but one bit set, or 131 | all zero plus a counter that starts at zero. 132 | 133 | These constants passed: 134 | 14 11 25 16 4 14 24 135 | 12 14 25 16 4 14 24 136 | and these came close: 137 | 4 8 15 26 3 22 24 138 | 10 8 15 26 3 22 24 139 | 11 8 15 26 3 22 24 140 | ------------------------------------------------------------------------------- 141 | */ 142 | #define final(a, b, c) \ 143 | { \ 144 | c ^= b; \ 145 | c -= rot(b, 14); \ 146 | a ^= c; \ 147 | a -= rot(c, 11); \ 148 | b ^= a; \ 149 | b -= rot(a, 25); \ 150 | c ^= b; \ 151 | c -= rot(b, 16); \ 152 | a ^= c; \ 153 | a -= rot(c, 4); \ 154 | b ^= a; \ 155 | b -= rot(a, 14); \ 156 | c ^= b; \ 157 | c -= rot(b, 24); \ 158 | } 159 | 160 | static __attribute__((always_inline)) __u32 hashlittle(const void *key, __u64 length, __u32 initval) 161 | { 162 | __u32 a, b, c; /* internal state */ 163 | const __u32 *k = (const __u32 *)key; /* read 32-bit chunks */ 164 | const __u32 *end = k + (length / 12) * 3; 165 | const __u8 *k8; 166 | 167 | /* Set up the internal state */ 168 | a = b = c = 0xdeadbeef + ((__u32)length) + initval; 169 | 170 | /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ 171 | #pragma clang loop unroll(full) 172 | while (k != end) { 173 | a += k[0]; 174 | b += k[1]; 175 | c += k[2]; 176 | mix(a, b, c); 177 | k += 3; 178 | } 179 | 180 | /*----------------------------- handle the last (probably partial) block */ 181 | k8 = (const __u8 *)k; 182 | switch (length % 12) { 183 | case 12: 184 | c += k[2]; 185 | b += k[1]; 186 | a += k[0]; 187 | break; 188 | case 11: 189 | c += ((__u32)k8[10]) << 16; /* fall through */ 190 | case 10: 191 | c += ((__u32)k8[9]) << 8; /* fall through */ 192 | case 9: 193 | c += k8[8]; /* fall through */ 194 | case 8: 195 | b += k[1]; 196 | a += k[0]; 197 | break; 198 | case 7: 199 | b += ((__u32)k8[6]) << 16; /* fall through */ 200 | case 6: 201 | b += ((__u32)k8[5]) << 8; /* fall through */ 202 | case 5: 203 | b += k8[4]; /* fall through */ 204 | case 4: 205 | a += k[0]; 206 | break; 207 | case 3: 208 | a += ((__u32)k8[2]) << 16; /* fall through */ 209 | case 2: 210 | a += ((__u32)k8[1]) << 8; /* fall through */ 211 | case 1: 212 | a += k8[0]; 213 | break; 214 | case 0: 215 | return c; 216 | } 217 | 218 | final(a, b, c); 219 | return c; 220 | } 221 | 222 | #undef hashsize 223 | #undef hashmask 224 | #undef rot 225 | #undef mix 226 | #undef final 227 | -------------------------------------------------------------------------------- /include/mindef.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef NULL 4 | #define NULL ((void *)0) 5 | #endif 6 | 7 | #ifndef offsetof 8 | #define offsetof(type, member) __builtin_offsetof(type, member) 9 | #endif 10 | 11 | #ifndef offsetofend 12 | #define offsetofend(type, member) (offsetof(type, member) + sizeof((((type *)0)->member))) 13 | #endif 14 | -------------------------------------------------------------------------------- /include/stdbool.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define bool _Bool 4 | 5 | #define true 1 6 | #define false 0 7 | -------------------------------------------------------------------------------- /rakelimit.go: -------------------------------------------------------------------------------- 1 | package rakelimit 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "syscall" 7 | 8 | "github.com/cilium/ebpf" 9 | "github.com/cilium/ebpf/asm" 10 | "golang.org/x/sys/unix" 11 | ) 12 | 13 | //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang-12 rake ./src/rakelimit.c -- -I./include -nostdinc -Os 14 | 15 | // Rakelimit holds an instance of a ratelimiter that can be applied on a socket 16 | type Rakelimit struct { 17 | domain int 18 | program *ebpf.Program 19 | bpfObjects *rakeObjects 20 | } 21 | 22 | // New creates a new Rakelimit instance based on the specified ppsLimit 23 | func New(conn syscall.Conn, ppsLimit uint32) (*Rakelimit, error) { 24 | // set ratelimit 25 | spec, err := loadRake() 26 | if err != nil { 27 | return nil, fmt.Errorf("get elf spec: %v", err) 28 | } 29 | 30 | if err := rewriteConstant(spec, "LIMIT", uint64(ppsLimit)); err != nil { 31 | return nil, err 32 | } 33 | 34 | var objs rakeObjects 35 | if err := spec.LoadAndAssign(&objs, nil); err != nil { 36 | return nil, fmt.Errorf("load BPF: %v", err) 37 | } 38 | 39 | raw, err := conn.SyscallConn() 40 | if err != nil { 41 | return nil, fmt.Errorf("raw conn: %s", err) 42 | } 43 | 44 | var opErr error 45 | var domain int 46 | var prog *ebpf.Program 47 | if err := raw.Control(func(s uintptr) { 48 | domain, opErr = unix.GetsockoptInt(int(s), unix.SOL_SOCKET, unix.SO_DOMAIN) 49 | if opErr != nil { 50 | opErr = fmt.Errorf("can't retrieve domain: %s", opErr) 51 | return 52 | } 53 | 54 | switch domain { 55 | case unix.AF_INET: 56 | prog = objs.FilterIpv4 57 | case unix.AF_INET6: 58 | prog = objs.FilterIpv6 59 | default: 60 | opErr = fmt.Errorf("unsupported socket domain: %d", domain) 61 | return 62 | } 63 | 64 | opErr = unix.SetsockoptInt(int(s), unix.SOL_SOCKET, unix.SO_ATTACH_BPF, prog.FD()) 65 | if errors.Is(opErr, unix.ENOMEM) { 66 | opErr = fmt.Errorf("attach filter: net.core.optmem_max might be too low: %s", opErr) 67 | return 68 | } 69 | if opErr != nil { 70 | opErr = fmt.Errorf("attach filter: %s", opErr) 71 | } 72 | }); err != nil { 73 | return nil, fmt.Errorf("can't access fd: %s", err) 74 | } 75 | if opErr != nil { 76 | return nil, opErr 77 | } 78 | 79 | return &Rakelimit{domain, prog, &objs}, nil 80 | } 81 | 82 | // Close cleans up resources occupied and should be called when finished using the structure 83 | func (rl *Rakelimit) Close() error { 84 | return rl.bpfObjects.Close() 85 | } 86 | 87 | func rewriteConstant(spec *ebpf.CollectionSpec, symbol string, value uint64) error { 88 | rewritten := false 89 | for name, prog := range spec.Programs { 90 | for i := range prog.Instructions { 91 | ins := &prog.Instructions[i] 92 | if ins.Reference != symbol { 93 | continue 94 | } 95 | 96 | if !ins.IsConstantLoad(asm.DWord) { 97 | return fmt.Errorf("program %s: instruction %d: not a dword-sized constant load: %s", name, i, ins) 98 | } 99 | 100 | ins.Constant = int64(value) 101 | rewritten = true 102 | } 103 | } 104 | 105 | if !rewritten { 106 | return fmt.Errorf("symbol %s is not referenced", symbol) 107 | } 108 | 109 | return nil 110 | } 111 | -------------------------------------------------------------------------------- /rakelimit_test.go: 
-------------------------------------------------------------------------------- 1 | package rakelimit 2 | 3 | import ( 4 | "math" 5 | "net" 6 | "testing" 7 | "time" 8 | 9 | "github.com/cilium/ebpf" 10 | "github.com/google/gopacket" 11 | "github.com/google/gopacket/layers" 12 | "golang.org/x/sys/unix" 13 | ) 14 | 15 | func TestLoad(t *testing.T) { 16 | spec, err := loadRake() 17 | if err != nil { 18 | t.Fatal(err) 19 | } 20 | 21 | if err := rewriteConstant(spec, "LIMIT", uint64(100)); err != nil { 22 | t.Fatal(err) 23 | } 24 | 25 | t.Run("IPv4", func(t *testing.T) { 26 | var objs struct { 27 | Prog *ebpf.Program `ebpf:"filter_ipv4"` 28 | } 29 | if err := spec.LoadAndAssign(&objs, nil); err != nil { 30 | t.Error(err) 31 | } 32 | }) 33 | t.Run("IPv6", func(t *testing.T) { 34 | var objs struct { 35 | Prog *ebpf.Program `ebpf:"filter_ipv6"` 36 | } 37 | if err := spec.LoadAndAssign(&objs, nil); err != nil { 38 | t.Error(err) 39 | } 40 | }) 41 | } 42 | 43 | const floatBits = 32 44 | 45 | type FixedPointTuple struct { 46 | k, v uint64 47 | } 48 | 49 | /* TestBPFFloatToFixedPoint tests the convesion of integers/floats to fixed-point on the 50 | userspace & the bpf side to ensure both convert it in the same way */ 51 | func TestBPFFloatToFixedPoint(t *testing.T) { 52 | var objs rakeObjects 53 | if err := loadRakeObjects(&objs, nil); err != nil { 54 | t.Fatal("Can't load program", err) 55 | 56 | } 57 | defer objs.Close() 58 | 59 | prog := objs.TestFpCmp 60 | lookupTable := objs.TestSingleResult 61 | payload := make([]byte, 14) 62 | 63 | // check 27 64 | if err := lookupTable.Put(uint32(0), floatToFixed(27.0)); err != nil { 65 | t.Fatal(err) 66 | } 67 | 68 | res, _, err := prog.Test(payload) 69 | if err != nil { 70 | t.Fatal(err) 71 | } 72 | if res != 0 { 73 | t.Fatalf("Error on line %d", res) 74 | } 75 | 76 | var fp uint64 77 | if err := lookupTable.Lookup(uint32(0), &fp); err != nil { 78 | t.Fatal(err) 79 | } 80 | 81 | // check if bpf to go works 82 | fl := fixedToFloat(fp) 83 | if fl != 19 { 84 | t.Fatal("Expected 19, got", fl) 85 | } 86 | } 87 | 88 | func TestBPFFEwma(t *testing.T) { 89 | const ( 90 | rateKey uint32 = iota 91 | oldTSKey 92 | newTSKey 93 | ) 94 | 95 | var objs rakeObjects 96 | if err := loadRakeObjects(&objs, nil); err != nil { 97 | t.Fatal("Can't load program", err) 98 | 99 | } 100 | defer objs.Close() 101 | 102 | prog := objs.TestEwma 103 | sr := objs.TestSingleResult 104 | 105 | sr.Put(rateKey, uint64(50)) 106 | sr.Put(oldTSKey, uint64(346534651)) 107 | sr.Put(newTSKey, uint64(415841581)) 108 | 109 | ret, _, err := prog.Test(make([]byte, 14)) 110 | if err != nil { 111 | t.Fatal(err) 112 | } 113 | if ret == 0 { 114 | t.Fatal("Unexpected return from BPF program") 115 | } 116 | 117 | var result uint64 118 | if err := sr.Lookup(rateKey, &result); err != nil { 119 | t.Fatal(err) 120 | } 121 | 122 | if result != 31 { 123 | t.Error("Expected 31, got", result) 124 | } 125 | } 126 | 127 | func BenchmarkRakelimit(b *testing.B) { 128 | b.Run("IPv4", func(b *testing.B) { 129 | rake := mustNew(b, "127.0.0.1:0", math.MaxUint32) 130 | 131 | packet := mustSerializeLayers(b, 132 | &layers.Ethernet{ 133 | SrcMAC: []byte{1, 2, 3, 4, 5, 6}, 134 | DstMAC: []byte{6, 5, 4, 3, 2, 1}, 135 | EthernetType: layers.EthernetTypeIPv4, 136 | }, 137 | &layers.IPv4{ 138 | Version: 4, 139 | SrcIP: net.IPv4(192, 0, 2, 0), 140 | DstIP: net.IPv4(192, 0, 2, 123), 141 | Protocol: layers.IPProtocolUDP, 142 | }, 143 | &layers.UDP{ 144 | SrcPort: layers.UDPPort(12345), 145 | DstPort: layers.UDPPort(443), 146 | }, 147 | 
gopacket.Payload([]byte{1, 2, 3, 4}), 148 | ) 149 | b.ResetTimer() 150 | 151 | lastRet, duration, err := rake.program.Benchmark(packet, b.N, b.ResetTimer) 152 | if err != nil { 153 | b.Fatal(err) 154 | } 155 | 156 | if lastRet == 0 { 157 | b.Error("Packet was dropped") 158 | } 159 | 160 | b.ReportMetric(float64(duration/time.Nanosecond), "ns/op") 161 | }) 162 | 163 | b.Run("IPv6", func(b *testing.B) { 164 | rake := mustNew(b, "[::1]:0", math.MaxUint32) 165 | 166 | packet := mustSerializeLayers(b, 167 | &layers.Ethernet{ 168 | SrcMAC: []byte{1, 2, 3, 4, 5, 6}, 169 | DstMAC: []byte{6, 5, 4, 3, 2, 1}, 170 | EthernetType: layers.EthernetTypeIPv6, 171 | }, 172 | &layers.IPv6{ 173 | Version: 6, 174 | SrcIP: net.ParseIP("fd::1"), 175 | DstIP: net.ParseIP("fc::1337"), 176 | NextHeader: layers.IPProtocolUDP, 177 | }, 178 | &layers.UDP{ 179 | SrcPort: layers.UDPPort(12345), 180 | DstPort: layers.UDPPort(443), 181 | }, 182 | gopacket.Payload([]byte{1, 2, 3, 4}), 183 | ) 184 | b.ResetTimer() 185 | 186 | lastRet, duration, err := rake.program.Benchmark(packet, b.N, b.ResetTimer) 187 | if err != nil { 188 | b.Fatal(err) 189 | } 190 | 191 | if lastRet == 0 { 192 | b.Error("Packet was dropped") 193 | } 194 | 195 | b.ReportMetric(float64(duration/time.Nanosecond), "ns/op") 196 | }) 197 | } 198 | 199 | func mustSerializeLayers(tb testing.TB, layers ...gopacket.SerializableLayer) []byte { 200 | tb.Helper() 201 | 202 | buf := gopacket.NewSerializeBuffer() 203 | opts := gopacket.SerializeOptions{ 204 | FixLengths: true, 205 | } 206 | err := gopacket.SerializeLayers(buf, opts, layers...) 207 | if err != nil { 208 | tb.Fatal("Can't serialize layers:", err) 209 | } 210 | 211 | return buf.Bytes() 212 | } 213 | 214 | type testRakelimit struct { 215 | *Rakelimit 216 | testProgram *ebpf.Program 217 | args *ebpf.Map 218 | conn *net.UDPConn 219 | } 220 | 221 | const ( 222 | timeArgKey uint32 = iota 223 | randArgKey 224 | rateExceededOnLevelKey 225 | ) 226 | 227 | func mustNew(tb testing.TB, addr string, limit uint32) *testRakelimit { 228 | tb.Helper() 229 | 230 | conn, err := net.ListenPacket("udp", addr) 231 | if err != nil { 232 | tb.Fatal("Can't listen:", err) 233 | } 234 | tb.Cleanup(func() { conn.Close() }) 235 | 236 | udp := conn.(*net.UDPConn) 237 | rake, err := New(udp, limit) 238 | if err != nil { 239 | tb.Fatal("Can't create limiter:", err) 240 | } 241 | tb.Cleanup(func() { rake.Close() }) 242 | 243 | prog := rake.bpfObjects.TestIpv4 244 | if rake.domain == unix.AF_INET6 { 245 | prog = rake.bpfObjects.TestIpv6 246 | } 247 | 248 | args := rake.bpfObjects.TestSingleResult 249 | if err := args.Put(randArgKey, uint64(math.MaxUint32+1)); err != nil { 250 | tb.Fatal("Can't update rand:", err) 251 | } 252 | 253 | return &testRakelimit{rake, prog, args, udp} 254 | } 255 | 256 | func (trl *testRakelimit) updateTime(tb testing.TB, now uint64) { 257 | tb.Helper() 258 | 259 | if now < math.MaxUint64 { 260 | // Make sure we never use a zero time, since the ewma code 261 | // assumes that zero means uninitialised. 
262 | now++ 263 | } 264 | 265 | if err := trl.args.Put(timeArgKey, now); err != nil { 266 | tb.Error("Can't update time:", err) 267 | } 268 | } 269 | 270 | func (trl *testRakelimit) updateRand(tb testing.TB, value uint32) { 271 | tb.Helper() 272 | 273 | if err := trl.args.Put(randArgKey, uint64(value)); err != nil { 274 | tb.Error("Can't update rand:", err) 275 | } 276 | } 277 | 278 | func (trl *testRakelimit) rateExceededOnLevel(tb testing.TB) uint32 { 279 | tb.Helper() 280 | 281 | var level uint64 282 | if err := trl.args.Lookup(rateExceededOnLevelKey, &level); err != nil { 283 | tb.Fatal("Can't lookup drop level:", err) 284 | } 285 | 286 | return uint32(level) 287 | } 288 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #define FORCE_INLINE inline __attribute__((__always_inline__)) 2 | 3 | /* from linux/socket.h */ 4 | #define AF_INET 2 /* Internet IP Protocol */ 5 | #define AF_INET6 10 /* IP version 6 */ 6 | /***********************/ 7 | 8 | /* from linux/filter.h */ 9 | #define BPF_NET_OFF (-0x100000) 10 | #define BPF_LL_OFF (-0x200000) 11 | /***********************/ 12 | 13 | /* Accept - allow any number of bytes */ 14 | #define SKB_PASS -1 15 | /* Drop, cut packet to zero bytes */ 16 | #define SKB_REJECT 0 17 | 18 | #define ETH_P_IP 0x0800 19 | #define ETH_P_IPV6 0x86DD 20 | 21 | unsigned long long load_byte(void *skb, unsigned long long off) asm("llvm.bpf.load.byte"); 22 | unsigned long long load_half(void *skb, unsigned long long off) asm("llvm.bpf.load.half"); 23 | unsigned long long load_word(void *skb, unsigned long long off) asm("llvm.bpf.load.word"); 24 | 25 | #define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) 26 | -------------------------------------------------------------------------------- /src/countmin.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | #include "common.h" 7 | #include "ewma.h" 8 | #include "fasthash.h" 9 | #include "fixed-point.h" 10 | #include "lookup3.h" 11 | 12 | // countmin sketch paper: http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf 13 | // 14 | // A cm sketch can be thought of as a two dimensional array width d rows and 15 | // w columns. Each row uses a distinct hash function to index into its columns. 16 | // 17 | // The paper shows the following error bounds for the estimation, provided we 18 | // choose d = ceil(ln(1/gamma)) and w = ceil(e/E) (see page 7). 19 | // 20 | // a <= a' 21 | // a' <= E * ||a|| with probability at least (1 - gamma) 22 | // a : the true answer 23 | // a' : the estimate made by the cm sketch 24 | // E : a chosen error bound 25 | // gamma: desired probability of the upper bound 26 | // ||a||: the sum of all previous observations (I think) 27 | // 28 | // We always choose w to be a power of two to be able to cheaply index into the cm 29 | // sketch based on a hash value. For d = 2 and w = 512 we get gamma ~0.14 and E ~0.005. 30 | // 31 | // a <= a' <= ~0.005 * ||a|| (with probability ~0.86) 32 | // 33 | // Using 3 instead of 2 hash functions would increase the probability to 0.96. For 34 | // that we need another function however. 
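// Worked example (illustrative, derived by inverting the formulas above rather
// than taken from the paper): picking d rows fixes gamma = exp(-d), and picking
// w columns fixes E = e / w. With d = 2 and w = 512 that gives
//   gamma = exp(-2)  ~= 0.135   (so 1 - gamma ~= 0.86)
//   E     = e / 512  ~= 0.0053
// which is where the ~0.14 and ~0.005 quoted above come from. Choosing w as a
// power of two also means a row can be indexed with a mask instead of a modulo,
// i.e. column = hash & (COLUMNS - 1), as done in cm_add_and_query() below.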
35 | 36 | #define HASHFN_N 2 37 | #define COLUMNS 512 38 | 39 | _Static_assert((COLUMNS & (COLUMNS - 1)) == 0, "COLUMNS must be a power of two"); 40 | 41 | struct cm_value { 42 | __u32 value; 43 | __u64 ts; 44 | }; 45 | 46 | struct cm_hash { 47 | __u32 values[HASHFN_N]; 48 | }; 49 | 50 | struct countmin { 51 | struct cm_value values[HASHFN_N][COLUMNS]; 52 | }; 53 | 54 | // add element and determine count 55 | static __u32 FORCE_INLINE cm_add_and_query(struct countmin *cm, __u64 now, const struct cm_hash *h) 56 | { 57 | __u32 min = -1; 58 | #pragma clang loop unroll(full) 59 | for (int i = 0; i < ARRAY_SIZE(cm->values); i++) { 60 | __u32 target_idx = h->values[i] & (ARRAY_SIZE(cm->values[i]) - 1); 61 | struct cm_value *value = &cm->values[i][target_idx]; 62 | value->value = estimate_rate(value->value, value->ts, now); 63 | value->ts = now; 64 | if (value->value < min) { 65 | min = value->value; 66 | } 67 | } 68 | return min; 69 | } 70 | -------------------------------------------------------------------------------- /src/ewma.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "common.h" 6 | #include "fixed-point.h" 7 | 8 | // estimate_avg_rate takes a previous rate and a duration that elapsed 9 | // since this rate has been determined, and estimates based on these and 10 | // WINDOW the current rate in packets per second. 11 | static __u32 FORCE_INLINE estimate_rate(__u32 old_rate, __u64 old_ts, __u64 now) 12 | { 13 | // The window after which old observations are discarded. 14 | // Chosen to be a power of two so that division can be done 15 | // with a bit shift. 16 | const __u32 WINDOW_NS = 1ull << 27; 17 | const __u32 ONE_SECOND_NS = 1000000000ull; 18 | 19 | if (old_ts >= now) { 20 | // Time went backward or stood still due to clockskew. Return the old value, 21 | // since we can't compute the current rate. 22 | return old_rate; 23 | } 24 | 25 | __s64 elapsed = now - old_ts; 26 | if (old_ts == 0 || elapsed >= WINDOW_NS) { 27 | // Either there is no previous measurement, or it's too old. 28 | // We need another sample to calculate a reliable rate. 29 | return 0; 30 | } 31 | 32 | __u32 rate_current = ONE_SECOND_NS / (__u32)elapsed; 33 | if (old_rate == 0) { 34 | // This is the first time we can calculate a rate, so use that 35 | // to initialize our estimate. 
36 | return rate_current; 37 | } 38 | 39 | const fpoint one = to_fixed_point(1, 0); 40 | fpoint a = div_by_int(to_fixed_point(elapsed, 0), WINDOW_NS); 41 | 42 | return to_int(a * rate_current + (one - a) * old_rate); 43 | } -------------------------------------------------------------------------------- /src/fixed-point.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "common.h" 8 | 9 | #define FRACTION_BITS 32 10 | 11 | typedef __u64 fpoint; 12 | 13 | static __u64 FORCE_INLINE to_fixed_point(__u32 integer, __u32 fraction) 14 | { 15 | return (((__u64)integer) << FRACTION_BITS) | (__u64)fraction; 16 | } 17 | 18 | static __u32 FORCE_INLINE to_int(fpoint a) 19 | { 20 | return a >> FRACTION_BITS; 21 | } 22 | 23 | static fpoint FORCE_INLINE div_by_int(fpoint dividend, __u32 divisor) 24 | { 25 | return dividend / divisor; 26 | } 27 | -------------------------------------------------------------------------------- /src/rakelimit.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include "common.h" 12 | #include "countmin.h" 13 | #include "fasthash.h" 14 | 15 | #define FH_SEED (0x2d31e867) 16 | #define L3_SEED (0x6ad611c3) 17 | 18 | #define PARAMETER(type, name) \ 19 | ({ \ 20 | type __tmp; \ 21 | _Static_assert(sizeof(__tmp) <= sizeof(__u64), name " exceeds 64 bits"); \ 22 | asm("%0 = " name " ll" : "=r"(__tmp)); \ 23 | __tmp; \ 24 | }) 25 | 26 | enum address_gen { 27 | ADDRESS_IP = 0, // /32 or /128 28 | ADDRESS_NET = 1, // /24 or /48 29 | ADDRESS_WILDCARD = 2, // /0 30 | }; 31 | 32 | enum port_gen { 33 | PORT_SPECIFIED = 0, 34 | PORT_WILDCARD = 1, 35 | }; 36 | 37 | struct gen { 38 | int level; 39 | enum address_gen source; 40 | enum port_gen source_port; 41 | enum address_gen dest; 42 | enum port_gen dest_port; 43 | bool evaluate; 44 | }; 45 | 46 | struct address_hash { 47 | __u64 vals[ADDRESS_WILDCARD]; 48 | }; 49 | 50 | struct hash { 51 | struct address_hash src; 52 | struct address_hash dst; 53 | __u64 src_port; 54 | __u64 dst_port; 55 | }; 56 | 57 | static const struct gen generalisations[] = { 58 | /*level 0*/ 59 | {0, ADDRESS_IP, PORT_SPECIFIED, ADDRESS_IP, PORT_SPECIFIED, true}, 60 | 61 | /* level 1 */ 62 | {1, ADDRESS_NET, PORT_SPECIFIED, ADDRESS_IP, PORT_SPECIFIED, false}, 63 | {1, ADDRESS_IP, PORT_WILDCARD, ADDRESS_IP, PORT_SPECIFIED, false}, 64 | {1, ADDRESS_IP, PORT_SPECIFIED, ADDRESS_IP, PORT_WILDCARD, true}, 65 | 66 | /* level 2 */ 67 | /* *.*.*.*:i --> w.x.y.z:j */ 68 | {2, ADDRESS_WILDCARD, PORT_SPECIFIED, ADDRESS_IP, PORT_SPECIFIED, false}, 69 | /* a.b.c.*:* --> w.x.y.z:j */ 70 | {2, ADDRESS_NET, PORT_WILDCARD, ADDRESS_IP, PORT_SPECIFIED, false}, 71 | /* a.b.c.*:i --> w.x.y.z:* */ 72 | {2, ADDRESS_NET, PORT_SPECIFIED, ADDRESS_IP, PORT_WILDCARD, false}, 73 | /* a.b.c.d:* --> w.x.y.z:* */ 74 | {2, ADDRESS_IP, PORT_WILDCARD, ADDRESS_IP, PORT_WILDCARD, true}, 75 | 76 | /* level 3 */ 77 | /* *.*.*.*:* --> w.x.y.z:j */ 78 | {3, ADDRESS_WILDCARD, PORT_WILDCARD, ADDRESS_IP, PORT_SPECIFIED, false}, 79 | /* *.*.*.*:i --> w.x.y.z:* */ 80 | {3, ADDRESS_WILDCARD, PORT_SPECIFIED, ADDRESS_IP, PORT_WILDCARD, false}, 81 | /* A.B.C.*:* --> w.x.y.z:* */ 82 | {3, ADDRESS_NET, PORT_WILDCARD, ADDRESS_IP, PORT_WILDCARD, true}, 83 | 84 | /* level 4 */ 85 | {4, ADDRESS_WILDCARD, PORT_WILDCARD, ADDRESS_IP, PORT_WILDCARD, true}, 86 | }; 87 | 88 
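// Illustrative walk-through (not part of the original source): for an IPv4
// packet from a.b.c.d:i to w.x.y.z:j the three level-1 rows above count the
// packet under (a.b.c.0/24, i -> w.x.y.z, j), (a.b.c.d, * -> w.x.y.z, j) and
// (a.b.c.d, i -> w.x.y.z, *). process_packet() below keeps the maximum
// estimated rate seen within each level and, on the row marked evaluate = true,
// compares that maximum against LIMIT to decide whether the packet goes through
// drop_or_accept().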
| // collect number of packet drops per level 89 | struct bpf_map_def SEC("maps") stats = { 90 | .type = BPF_MAP_TYPE_ARRAY, 91 | .key_size = sizeof(__u32), 92 | .value_size = sizeof(__u64), 93 | .max_entries = 5, // 5 levels 94 | }; 95 | 96 | struct bpf_map_def SEC("maps") countmin = { 97 | .type = BPF_MAP_TYPE_ARRAY, 98 | .key_size = sizeof(__u32), 99 | .value_size = sizeof(struct countmin), 100 | .max_entries = ARRAY_SIZE(generalisations), 101 | }; 102 | 103 | static FORCE_INLINE void ipv6_hash(const struct in6_addr *ip, struct address_hash *a, struct address_hash *b) 104 | { 105 | a->vals[ADDRESS_IP] = fasthash64(ip, sizeof(*ip), FH_SEED); 106 | b->vals[ADDRESS_IP] = hashlittle(ip, sizeof(*ip), L3_SEED); 107 | a->vals[ADDRESS_NET] = fasthash64(ip, 48 / 8, FH_SEED); 108 | b->vals[ADDRESS_NET] = hashlittle(ip, 48 / 8, L3_SEED); 109 | } 110 | 111 | static FORCE_INLINE void ipv4_hash(struct in_addr ip, struct address_hash *a, struct address_hash *b) 112 | { 113 | a->vals[ADDRESS_IP] = fasthash64(&ip, sizeof(ip), FH_SEED); 114 | b->vals[ADDRESS_IP] = hashlittle(&ip, sizeof(ip), L3_SEED); 115 | ip.s_addr &= 0xffffff00; 116 | a->vals[ADDRESS_NET] = fasthash64(&ip, sizeof(ip), FH_SEED); 117 | b->vals[ADDRESS_NET] = hashlittle(&ip, sizeof(ip), L3_SEED); 118 | } 119 | 120 | static FORCE_INLINE __u64 hash_mix(__u64 a, __u64 b) 121 | { 122 | // Adapted from https://stackoverflow.com/a/27952689. The constant below 123 | // is derived from the golden ratio. 124 | a ^= b + 0x9e3779b97f4a7c15 + (a << 6) + (a >> 2); 125 | return a; 126 | } 127 | 128 | static FORCE_INLINE __u32 gen_hash(const struct gen *gen, const struct hash *ph) 129 | { 130 | __u64 tmp = 0; 131 | 132 | if (gen->source != ADDRESS_WILDCARD) { 133 | tmp = hash_mix(tmp, ph->src.vals[gen->source]); 134 | } 135 | 136 | if (gen->dest != ADDRESS_WILDCARD) { 137 | tmp = hash_mix(tmp, ph->dst.vals[gen->dest]); 138 | } 139 | 140 | if (gen->source_port != PORT_WILDCARD) { 141 | tmp = hash_mix(tmp, ph->src_port); 142 | } 143 | 144 | if (gen->dest_port != PORT_WILDCARD) { 145 | tmp = hash_mix(tmp, ph->dst_port); 146 | } 147 | 148 | // Adapted from fasthash32 149 | return tmp - (tmp >> 32); 150 | } 151 | 152 | static __u32 FORCE_INLINE add_to_node(__u32 node_idx, __u64 ts, const struct cm_hash *h) 153 | { 154 | struct countmin *node = bpf_map_lookup_elem(&countmin, &node_idx); 155 | if (node == NULL) { 156 | return -1; 157 | } 158 | return cm_add_and_query(node, ts, h); 159 | } 160 | 161 | static FORCE_INLINE void log_level_drop(__u32 level) 162 | { 163 | __u64 *count = bpf_map_lookup_elem(&stats, &level); 164 | if (count == NULL) { 165 | return; 166 | } 167 | (*count)++; 168 | } 169 | 170 | static FORCE_INLINE __u64 transport_offset_ipv4(struct __sk_buff *skb) 171 | { 172 | __u8 version_ihl = load_byte(skb, offsetof(struct iphdr, version_ihl)); 173 | return (version_ihl & 0xf) * sizeof(__u32); 174 | } 175 | 176 | static FORCE_INLINE __u64 transport_offset_ipv6(struct __sk_buff *skb) 177 | { 178 | // TODO: Check nexthdr to make sure it's UDP. 
179 | return sizeof(struct ip6_hdr); 180 | } 181 | 182 | static FORCE_INLINE int load_ipv6(struct in6_addr *ip, struct __sk_buff *skb, __u64 off) 183 | { 184 | return bpf_skb_load_bytes(skb, off, ip, sizeof(*ip)); 185 | } 186 | 187 | static FORCE_INLINE int drop_or_accept(__u32 level, fpoint limit, __u32 max_rate, __u32 rand) 188 | { 189 | if (div_by_int(to_fixed_point(limit, 0), max_rate) < to_fixed_point(0, rand)) { 190 | log_level_drop(level); 191 | return SKB_REJECT; 192 | } 193 | return SKB_PASS; 194 | } 195 | 196 | static FORCE_INLINE int process_packet(struct __sk_buff *skb, __u16 proto, __u64 ts, __u32 rand, __u64 *rate_exceeded_level) 197 | { 198 | __u32 limit = PARAMETER(__u32, "LIMIT"); 199 | struct hash ph[HASHFN_N]; 200 | struct in6_addr ipv6; 201 | struct in_addr ipv4; 202 | __u32 max_rate = 0; 203 | 204 | if (limit == 0) { 205 | return SKB_PASS; 206 | } 207 | 208 | __u64 troff; 209 | switch (proto) { 210 | case ETH_P_IP: 211 | troff = transport_offset_ipv4(skb); 212 | ipv4.s_addr = load_word(skb, BPF_NET_OFF + offsetof(struct iphdr, saddr)); 213 | ipv4_hash(ipv4, &ph[0].src, &ph[1].src); 214 | ipv4.s_addr = load_word(skb, BPF_NET_OFF + offsetof(struct iphdr, daddr)); 215 | ipv4_hash(ipv4, &ph[0].dst, &ph[1].dst); 216 | break; 217 | 218 | case ETH_P_IPV6: 219 | troff = transport_offset_ipv6(skb); 220 | if (load_ipv6(&ipv6, skb, offsetof(struct ip6_hdr, ip6_src))) { 221 | return SKB_REJECT; 222 | } 223 | ipv6_hash(&ipv6, &ph[0].src, &ph[1].src); 224 | if (load_ipv6(&ipv6, skb, offsetof(struct ip6_hdr, ip6_dst))) { 225 | return SKB_REJECT; 226 | } 227 | ipv6_hash(&ipv6, &ph[0].dst, &ph[1].dst); 228 | break; 229 | 230 | default: 231 | return SKB_REJECT; 232 | } 233 | 234 | __u16 src_port = load_half(skb, troff); 235 | ph[0].src_port = fasthash64(&src_port, sizeof(src_port), FH_SEED); 236 | ph[1].src_port = hashlittle(&src_port, sizeof(src_port), L3_SEED); 237 | __u16 dst_port = load_half(skb, troff + 2); 238 | ph[0].dst_port = fasthash64(&dst_port, sizeof(dst_port), FH_SEED); 239 | ph[1].dst_port = hashlittle(&dst_port, sizeof(dst_port), L3_SEED); 240 | 241 | #pragma clang loop unroll(full) 242 | for (int i = 0; i < ARRAY_SIZE(generalisations); i++) { 243 | const struct gen *gen = &generalisations[i]; 244 | const int level = gen->level; 245 | 246 | // Force clang to inline level on the stack rather than loading it from 247 | // .rodata later on. 
248 | asm volatile("" : : "r"(level) : "memory"); 249 | 250 | struct cm_hash h = {{ 251 | gen_hash(gen, &ph[0]), 252 | gen_hash(gen, &ph[1]), 253 | }}; 254 | 255 | __u32 rate = add_to_node(i, ts, &h); 256 | 257 | if (rate > max_rate) { 258 | max_rate = rate; 259 | } 260 | 261 | if (gen->evaluate) { 262 | if (max_rate > limit) { 263 | if (rate_exceeded_level != NULL) { 264 | *rate_exceeded_level = level; 265 | } 266 | return drop_or_accept(level, limit, max_rate, rand); 267 | } 268 | 269 | max_rate = 0; 270 | } 271 | } 272 | 273 | return SKB_PASS; 274 | } 275 | 276 | SEC("socket/ipv4") 277 | int filter_ipv4(struct __sk_buff *skb) 278 | { 279 | return process_packet(skb, ETH_P_IP, bpf_ktime_get_ns(), bpf_get_prandom_u32(), NULL); 280 | } 281 | 282 | SEC("socket/ipv6") 283 | int filter_ipv6(struct __sk_buff *skb) 284 | { 285 | return process_packet(skb, ETH_P_IPV6, bpf_ktime_get_ns(), bpf_get_prandom_u32(), NULL); 286 | } 287 | 288 | // a map used for testing 289 | struct bpf_map_def SEC("maps") test_single_result = { 290 | .type = BPF_MAP_TYPE_ARRAY, 291 | .key_size = sizeof(__u32), 292 | .value_size = sizeof(__u64), 293 | .max_entries = 3, 294 | }; 295 | 296 | static FORCE_INLINE int test_filter(struct __sk_buff *skb, __u16 proto) 297 | { 298 | __u64 *ts, *randp, *rate_exceeded_level; 299 | __u32 rand; 300 | 301 | ts = bpf_map_lookup_elem(&test_single_result, &(__u32){0}); 302 | if (ts == NULL) { 303 | return SKB_PASS; 304 | } 305 | 306 | randp = bpf_map_lookup_elem(&test_single_result, &(__u32){1}); 307 | if (randp == NULL) { 308 | return SKB_PASS; 309 | } 310 | 311 | if (*randp > 0xffffffff) { 312 | rand = bpf_get_prandom_u32(); 313 | } else { 314 | rand = *randp; 315 | } 316 | 317 | rate_exceeded_level = bpf_map_lookup_elem(&test_single_result, &(__u32){2}); 318 | if (rate_exceeded_level == NULL) { 319 | return SKB_PASS; 320 | } 321 | 322 | // Always reset the level to some weird value that isn't zero. 323 | *rate_exceeded_level = -1; 324 | 325 | return process_packet(skb, proto, *ts, rand, rate_exceeded_level); 326 | } 327 | 328 | SEC("socket/test_ipv4") 329 | int test_ipv4(struct __sk_buff *skb) 330 | { 331 | return test_filter(skb, ETH_P_IP); 332 | } 333 | 334 | SEC("socket/test_ipv6") 335 | int test_ipv6(struct __sk_buff *skb) 336 | { 337 | return test_filter(skb, ETH_P_IPV6); 338 | } 339 | 340 | // test_fp_cmp takes the element with the index 0 out of the test_single_result map, and 341 | // compares if it is equal to some randomly chosen integer converted to a fixed-point (27 in this case). 342 | // Then we do the same thing the other way around and put a converted 19 into the map to ensure the userspace 343 | // implementation does its job as well 344 | SEC("socket/test1") 345 | int test_fp_cmp(struct __sk_buff *skb) 346 | { 347 | int i = 0; 348 | __u64 *fp = bpf_map_lookup_elem(&test_single_result, &i); 349 | if (fp == NULL) { 350 | return __LINE__; 351 | } 352 | // first check the value from userside 353 | if (to_fixed_point(27, 0) != *fp) { 354 | return __LINE__; 355 | } 356 | // then replace it 357 | *fp = to_fixed_point(19, 0); 358 | bpf_map_update_elem(&test_single_result, &i, fp, 0); 359 | return 0; 360 | } 361 | 362 | // test_ewma takes a previous rate from index 0 (as a u32) and an old and 363 | // new timestamp from index 1-2 (as u64) and estimates the current rate. 364 | // The result is written to the previous rate. 
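// Worked example (illustrative, using the inputs from TestBPFFEwma in
// rakelimit_test.go): old_rate = 50, old_ts = 346534651, now = 415841581 gives
//   elapsed      = 69306930 ns
//   rate_current = 1000000000 / 69306930    = 14 pps (integer division)
//   a            = 69306930 / 2^27          ~= 0.516
//   estimate     = 0.516 * 14 + 0.484 * 50  ~= 31.4 -> 31
// which is the value the Go test expects to read back from index 0.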
365 | SEC("socket/test2") 366 | int test_ewma(struct __sk_buff *skb) 367 | { 368 | __u64 *old_rate = bpf_map_lookup_elem(&test_single_result, &(__u32){0}); 369 | if (old_rate == NULL) { 370 | return SKB_REJECT; 371 | } 372 | 373 | __u64 *old_ts = bpf_map_lookup_elem(&test_single_result, &(__u32){1}); 374 | if (old_ts == NULL) { 375 | return SKB_REJECT; 376 | } 377 | 378 | __u64 *now = bpf_map_lookup_elem(&test_single_result, &(__u32){2}); 379 | if (now == NULL) { 380 | return SKB_REJECT; 381 | } 382 | 383 | *old_rate = estimate_rate(*old_rate, *old_ts, *now); 384 | return SKB_PASS; 385 | } 386 | 387 | char __license[] SEC("license") = "Dual BSD/GPL"; 388 | --------------------------------------------------------------------------------