├── .lcovrc ├── tools ├── flow-chart.png ├── verifier_log_to_cov_2.sh ├── tubular_mock.py ├── mmdecoy.c └── udpgrm_activate.py ├── crates └── udpgrm │ ├── README.md │ ├── Cargo.toml │ ├── examples │ ├── simple-flip.rs │ ├── tqclient.rs │ ├── client.rs │ └── udpserver-flow.rs │ └── src │ ├── types.rs │ └── lib.rs ├── examples ├── requirements.txt ├── cert.crt ├── cbpf-basic.sh ├── cbpf-naive-quic.sh ├── cert.key ├── http3_simple_client.py ├── http3_simple_server.py └── echoserver.py ├── .clang-format ├── .gitignore ├── tests ├── __init__.py ├── lsocket.py ├── runner.py ├── test_activate.py ├── utils.py ├── test_dissector_noop.py ├── test_dissector_bespoke.py ├── test_dissector_quic.py └── test_tubular.py ├── src ├── usage.txt ├── pidfd.c ├── utils.c ├── metrics.c ├── common.h ├── do_flows.c ├── cgroup.c ├── net.c ├── uspace.c ├── tubular.c └── do_list.c ├── include ├── udpgrm.h └── udpgrm_internal.h ├── ebpf ├── siphash.h ├── ebpf_sha256.c ├── ebpf_internal.h ├── ebpf_aes128.c ├── ebpf_bespoke.c └── ebpf_inter.c ├── Makefile └── LICENSE /.lcovrc: -------------------------------------------------------------------------------- 1 | geninfo_auto_base = 1 2 | lcov_branch_coverage = 1 3 | -------------------------------------------------------------------------------- /tools/flow-chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cloudflare/udpgrm/HEAD/tools/flow-chart.png -------------------------------------------------------------------------------- /crates/udpgrm/README.md: -------------------------------------------------------------------------------- 1 | # udpgrm 2 | 3 | Rust bindings to udpgrm's API. 4 | 5 | Use this instead of directly depending on udpgrm-sys. 
-------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | aioquic @ git+https://github.com/majek/aioquic@majek/custom-dcid 2 | wsproto 3 | starlette 4 | systemd-python 5 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 8 3 | UseTab: Always 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false 7 | AlwaysBreakBeforeMultilineStrings: true 8 | AllowShortBlocksOnASingleLine: false 9 | 10 | ContinuationIndentWidth: 8 11 | 12 | ColumnLimit: 90 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ebpf*.o 2 | ebpf.skel.h 3 | /udpgrm 4 | /arch/ 5 | *.deb 6 | examples/venv 7 | /tests/__pycache__/ 8 | *gcda 9 | *gcno 10 | *ll 11 | *obj 12 | udpgrm-test 13 | /cov_html/ 14 | cov_verifier_html 15 | mmdecoy 16 | /target/ 17 | *profraw 18 | LLVM* 19 | vhost/ 20 | node_modules/ 21 | crates/udpgrm/target 22 | tqserver 23 | client 24 | bpftool-log.tmp 25 | ebpf.su 26 | cov.info -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Cloudflare, Inc. 
2 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | # https://opensource.org/licenses/Apache-2.0 4 | 5 | from .test_basic import * # noqa: F401 F403 6 | from .test_dissector_flow import * # noqa: F401 F403 7 | from .test_dissector_quic import * # noqa: F401 F403 8 | from .test_dissector_cbpf import * # noqa: F401 F403 9 | from .test_tubular import * # noqa: F401 F403 10 | from .test_dissector_bespoke import * # noqa: F401 F403 11 | from .test_dissector_noop import * # noqa: F401 F403 12 | -------------------------------------------------------------------------------- /tests/lsocket.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Cloudflare, Inc. 2 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | # https://opensource.org/licenses/Apache-2.0 4 | 5 | import socket 6 | from socket import AF_INET, AF_INET6, SOCK_DGRAM, SOL_SOCKET, SO_REUSEPORT, IPPROTO_UDP 7 | UDP_GRM_WORKING_GEN = 200 8 | UDP_GRM_SOCKET_GEN = 201 9 | UDP_GRM_DISSECTOR = 202 10 | UDP_GRM_FLOW_ASSURE = 203 11 | UDP_GRM_SOCKET_APP = 204 12 | 13 | DISSECTOR_FLOW = 0 14 | DISSECTOR_CBPF = 1 15 | DISSECTOR_DIGEST = 3 16 | DISSECTOR_NOOP = 4 17 | DISSECTOR_FLAG_VERBOSE = 0x8000 18 | -------------------------------------------------------------------------------- /tests/runner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Cloudflare, Inc. 2 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | # https://opensource.org/licenses/Apache-2.0 4 | 5 | import os 6 | import unittest 7 | 8 | try: 9 | from teamcity.unittestpy import TeamcityTestRunner 10 | 11 | teamcity = True 12 | except ImportError: 13 | teamcity = False 14 | 15 | 16 | def is_running_under_teamcity(): 17 | # We export a different environment variable than teamcity package expects, 18 | # i.e.
TEAMCITY_VERSION, hence a custom predicate to detect TeamCity 19 | # builds. 20 | return bool(os.getenv("CI")) 21 | 22 | 23 | if __name__ == "__main__": 24 | if teamcity and is_running_under_teamcity(): 25 | runner = TeamcityTestRunner() 26 | else: 27 | # Let unittest create it and _configure_ it that we honor the command 28 | # line options like --verbose. 29 | runner = None 30 | 31 | unittest.main(module=None, testRunner=runner, failfast=True) 32 | -------------------------------------------------------------------------------- /crates/udpgrm/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "udpgrm" 3 | description = "Socket API bindings for udpgrm" 4 | license = "Apache-2.0" 5 | version = "0.1.1" 6 | edition = "2021" 7 | repository = "https://github.com/cloudflare/udpgrm-os" 8 | publish = false 9 | 10 | [features] 11 | default = [] 12 | tokio = ["dep:tokio"] 13 | socket2 = ["dep:socket2"] 14 | all = ["tokio", "socket2"] 15 | 16 | [dependencies] 17 | libc = "0.2" 18 | static_assertions = "1.1" 19 | socket2 = { version = "0.5", optional = true } 20 | tokio = { version = "1", optional = true, features = ["net"] } 21 | 22 | [dev-dependencies] 23 | nix = { version = "0.28", default-features = false, features = ["socket", "uio", "poll", "net"] } 24 | futures = "0.3" 25 | socket2 = { version = "0.5", features = ["all"] } 26 | tokio = { version = "1", features = ["rt", "macros", "net", "time"] } 27 | quiche = "0.24" 28 | ring = "0.17" 29 | mio = { version = "0.8", features = ["net", "os-poll"] } 30 | clap = { version = "4", features = ["derive"] } 31 | url = "2" 32 | signal-hook = "0.3" 33 | tokio-quiche = "0.6" 34 | boring = "4" 35 | udpgrm = { path = ".", features = ["socket2"] } 36 | -------------------------------------------------------------------------------- /examples/cert.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | 
MIIC7TCCAdUCFDuGBhl3l5Z++VCLkvaav4yteBonMA0GCSqGSIb3DQEBCwUAMEUx 3 | CzAJBgNVBAYTAkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEwHwYDVQQKDBhJbnRl 4 | cm5ldCBXaWRnaXRzIFB0eSBMdGQwHhcNMjAwMzIzMTYwNzU0WhcNNDcwODA5MTYw 5 | NzU0WjAhMQswCQYDVQQGEwJHQjESMBAGA1UEAwwJcXVpYy50ZWNoMIIBIjANBgkq 6 | hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAz5bOL7LD9kiIagcVrZqZ13ZcR0KhMuzs 7 | brqULbZKyqC+uBRgINxYJ7LPnJ4LPYuCt/nAaQ7CLXfKgzAMFu8eIK6UEvZA6+7b 8 | 20E4rvOpPbTB/T4JbYZNQKyM9AEwr6j0P6vFgrWT7aBzhkmiqEe5vv/7ZOEGb+Ab 9 | +cvMeszfBbk93nyzKdNaUuh95x7/p0Ow315np2PRuoT0QQnA9zE/9eZ3Jah3cNZn 10 | NuQ6BDHlkegzTV5JhYYblRo/pmt2E9E0ha+NWsRLf3ZJUYhkYR3UqMltEKuLglCO 11 | VWBbPmKd4IZUNIotpKMVQSVb9agNBF49hH9iBhN3fBm7Hp8KBpjJLwIDAQABMA0G 12 | CSqGSIb3DQEBCwUAA4IBAQCo/Rn4spa5XFk0cCoKypP27DxePkGD9rQZk/CY4inV 13 | JV16anZ1pr9yfO61+m3fRKTZq7yxtHRDWxDdROHx9LqV1dXLAmh1ecV9Kn6/796O 14 | EHsOcVB0Lfi9Ili7//oUqlhGNploRuQbgWAXU+Eo1xJRWIXeedhzBSgEOMaQk3Zn 15 | TdYFhP0/Ao/fEdI4VULv1A43ztnZIB2KXWgUQoFT32woL47eWge8LxxVmmH3STtz 16 | nNcGnYxIorCQemDHDzMrvxRWgHxkpFGGqAhkFFyCmhKFPglKwt+yVTx26T8tShID 17 | ISMj0rgVMptmtWKJfzNCvFG52gsuO4w3yGdjgjRRrBDm 18 | -----END CERTIFICATE----- 19 | -------------------------------------------------------------------------------- /src/usage.txt: -------------------------------------------------------------------------------- 1 | "Usage: udpgrm [OPTIONS] [list|flows|metrics]\n" 2 | "Options:\n" 3 | " -h, --help Display this information.\n" 4 | " -p, --pin-dir=DIR Bpffs path to pin progs to. Default: /sys/fs/bpf/udpgrm\n" 5 | " -i, --install=CGROUP Install bpf hooks in given cgroup. By default\n" 6 | " /sys/fs/cgroup/unified and /sys/fs/cgroup are tried.\n" 7 | " -s, --self Append cgroup path extracted from /proc/self/cgroup.\n" 8 | " --without-sendmsg Do not install sendmsg prog, if not using flow dissector.\n" 9 | " -t, --tubular=UDS Path to tubular Unix domain socket. 
No default.\n" 10 | " -d, --daemon Create bpf programs, pin them to bpffs.\n" 11 | " -v, --verbose Print more stuff.\n" 12 | " -f, --force Force daemon to start even if pins in /sys exist.\n" 13 | "\n" 14 | "See README for how to use udpgrm. Generally there are three modes:\n" 15 | " * daemon mode: run the thing, pin bpf programs and keep it running\n" 16 | " * install: install specific cgroup into running udpgrm\n" 17 | " * instruction: connect to running daemon and run an instruction. Available\n" 18 | " ones are: \"list\", \"flows\", \"metrics\", \"delete\".\n" 19 | "\n" 20 | -------------------------------------------------------------------------------- /examples/cbpf-basic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2025 Cloudflare, Inc. 4 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 5 | # https://opensource.org/licenses/Apache-2.0 6 | 7 | die() { 8 | echo "[!] bpf_asm not found. Please put it in your \$PATH" 9 | echo "[!] 
you can typically find it in linux/tools/bpf/bpf_asm" 10 | exit 1 11 | } 12 | 13 | which bpf_asm > /dev/null || die 14 | 15 | THISFILE=$0 16 | CUTLINENUMBER=$(awk '/^# -- [A]UTOGENERATED/ {print FNR}' $THISFILE) 17 | sed -i "$[CUTLINENUMBER]q" "$THISFILE" 18 | 19 | cat <> $THISFILE 20 | 21 | ; load first byte to figure out the app number, ASCII 0-3 22 | ldb [0] 23 | jlt #0x30, parse_udpgrm_cookie 24 | jgt #0x33, parse_udpgrm_cookie 25 | sub #0x30 26 | or #0x80000000 27 | ret a 28 | 29 | parse_udpgrm_cookie: 30 | ; the cookie is starting at offset 1 31 | ldh [1] 32 | ret a 33 | EOF 34 | exit 35 | # -- AUTOGENERATED FROM HERE -- 36 | { 0x30, 0, 0, 0000000000 }, 37 | { 0x35, 0, 4, 0x00000030 }, 38 | { 0x25, 3, 0, 0x00000033 }, 39 | { 0x14, 0, 0, 0x00000030 }, 40 | { 0x44, 0, 0, 0x80000000 }, 41 | { 0x16, 0, 0, 0000000000 }, 42 | { 0x28, 0, 0, 0x00000001 }, 43 | { 0x16, 0, 0, 0000000000 }, 44 | -------------------------------------------------------------------------------- /tools/verifier_log_to_cov_2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cat | awk ' 4 | # Match only lines starting with ; and containing @ file:line 5 | /^;/ && /@ [^:]+:[0-9]+/ { 6 | if (current != "") { 7 | print current, count 8 | } 9 | match($0, /@ ([^:]+):([0-9]+)/, arr) 10 | filename = arr[1] 11 | if (filename == "udpgrm_internal.h") { 12 | filename = "include/" filename 13 | } else { 14 | filename = "ebpf/" filename 15 | } 16 | file_count[filename]++ 17 | lineno = arr[2] 18 | current = filename ":" lineno 19 | count = 0 20 | next 21 | } 22 | 23 | # Match only lines that start with a number followed by a colon (e.g., 123: ...) 
24 | /^[0-9]+: [(]/ { 25 | if (current != "") { 26 | file_count[filename]++ 27 | count++ 28 | } 29 | } 30 | 31 | END { 32 | if (current != "") { 33 | print current, count 34 | } 35 | #for (f in file_count) { 36 | # print f, file_count[f] 37 | #} 38 | 39 | } 40 | ' | \ 41 | awk -F '[: ]+' ' 42 | { 43 | file = $1 44 | line = $2 45 | count = $3 46 | key = file ":" line 47 | file_line_counts[key] += count 48 | files[file] = 1 49 | } 50 | END { 51 | for (f in files) { 52 | print "TN:" 53 | print "SF:" f 54 | for (k in file_line_counts) { 55 | split(k, parts, ":") 56 | if (parts[1] == f) { 57 | print "DA:" parts[2] "," file_line_counts[k] 58 | } 59 | } 60 | print "end_of_record" 61 | } 62 | } 63 | ' 64 | -------------------------------------------------------------------------------- /tests/test_activate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Cloudflare, Inc. 2 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | # https://opensource.org/licenses/Apache-2.0 4 | 5 | from . 
import base 6 | from .lsocket import * 7 | import os 8 | import socket 9 | import tempfile 10 | 11 | 12 | class ActivateTest(base.TestCase): 13 | def test_activate_usage(self): 14 | p = base.Process(["python3", "examples/activate.py"], close_fds=True) 15 | self.assertTrue(p.collect_stderr("usage")[0]) 16 | p.close() 17 | 18 | def test_activate_simple(self): 19 | p = self.udpgrm_run("--daemon --install") 20 | self.assertTrue(p.collect_stderr("Tailing")[0]) 21 | 22 | notify_sock_dir = tempfile.mkdtemp() 23 | notify_sock_path = os.path.join(notify_sock_dir, "notify.sock") 24 | notify_sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) 25 | notify_sock.bind(notify_sock_path) 26 | 27 | p = base.Process(["python3", "examples/activate.py", "--quic", "-c", "1", "udp_test", 28 | "127.0.0.1:21000"], close_fds=True, env={"NOTIFY_SOCKET": notify_sock_path}) 29 | # Check that no errors are raised 30 | self.assertFalse(p.collect_stderr()) 31 | # Check that systemd receives socket store notifications 32 | self.assertEqual(b"FDSTOREREMOVE=1\nFDNAME=udp_test", 33 | notify_sock.recvmsg(1024)[0]) 34 | self.assertEqual(b"FDSTORE=1\nFDNAME=udp_test", 35 | notify_sock.recvmsg(1024)[0]) 36 | p.close() 37 | notify_sock.close() 38 | os.unlink(notify_sock_path) 39 | os.rmdir(notify_sock_dir) 40 | -------------------------------------------------------------------------------- /examples/cbpf-naive-quic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2025 Cloudflare, Inc. 4 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 5 | # https://opensource.org/licenses/Apache-2.0 6 | 7 | die() { 8 | echo "[!] bpf_asm not found. Please put it in your \$PATH" 9 | echo "[!] 
you can typically find it in linux/tools/bpf/bpf_asm" 10 | exit 1 11 | } 12 | 13 | which bpf_asm > /dev/null || die 14 | 15 | THISFILE=$0 16 | CUTLINENUMBER=$(awk '/^# -- [A]UTOGENERATED/ {print FNR}' $THISFILE) 17 | sed -i "$[CUTLINENUMBER]q" "$THISFILE" 18 | 19 | cat <> $THISFILE 20 | 21 | ; load first byte to figure out short vs long packet 22 | ldb [0] 23 | and #0x80 24 | jeq #0x80, long_form, short_form 25 | 26 | long_form: 27 | ; in the load packet, check if the length is exactly 16 bytes 28 | ldb [5] 29 | jneq #16, bad_length 30 | ldx #6 31 | jmp parse_dcid 32 | 33 | bad_length: 34 | ret #-1 35 | 36 | short_form: 37 | ldx #1 38 | jmp parse_dcid 39 | 40 | parse_dcid: 41 | ; the cookie is big-endian two bytes in DCID 42 | ldh [x + 0] 43 | ret a 44 | EOF 45 | exit 46 | # -- AUTOGENERATED FROM HERE -- 47 | { 0x30, 0, 0, 0000000000 }, 48 | { 0x54, 0, 0, 0x00000080 }, 49 | { 0x15, 0, 5, 0x00000080 }, 50 | { 0x30, 0, 0, 0x00000005 }, 51 | { 0x15, 0, 2, 0x00000010 }, 52 | { 0x01, 0, 0, 0x00000006 }, 53 | { 0x05, 0, 0, 0x00000003 }, 54 | { 0x06, 0, 0, 0xffffffff }, 55 | { 0x01, 0, 0, 0x00000001 }, 56 | { 0x05, 0, 0, 0000000000 }, 57 | { 0x48, 0, 0, 0000000000 }, 58 | { 0x16, 0, 0, 0000000000 }, 59 | -------------------------------------------------------------------------------- /examples/cert.key: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDPls4vssP2SIhq 3 | BxWtmpnXdlxHQqEy7OxuupQttkrKoL64FGAg3Fgnss+cngs9i4K3+cBpDsItd8qD 4 | MAwW7x4grpQS9kDr7tvbQTiu86k9tMH9Pglthk1ArIz0ATCvqPQ/q8WCtZPtoHOG 5 | SaKoR7m+//tk4QZv4Bv5y8x6zN8FuT3efLMp01pS6H3nHv+nQ7DfXmenY9G6hPRB 6 | CcD3MT/15nclqHdw1mc25DoEMeWR6DNNXkmFhhuVGj+ma3YT0TSFr41axEt/dklR 7 | iGRhHdSoyW0Qq4uCUI5VYFs+Yp3ghlQ0ii2koxVBJVv1qA0EXj2Ef2IGE3d8Gbse 8 | nwoGmMkvAgMBAAECggEBAMtFkpUmablKgTnBwjqCvs47OlUVK6AgW8x5qwuwC0Cr 9 | ctXyLcc/vJry/1UPdVZIvDHGv+Cf8Qhw2r7nV49FiqzaBmki9aOR+3uRPB4kvr6L 10 | 
t8Fw8+5pqlAAJu3wFGqN+M44N2mswDPaAAWpKTu7MGmVY+f+aT03qG1MYOiGoISK 11 | gP6DHiinddD38spM2muyCUyFZk9a+aBEfaQzZoU3gc0yB6R/qBOWZ7NIoIUMicku 12 | Zf3L6/06uunyZp+ueR83j1YWbg3JoYKlGAuQtDRF709+MQrim8lKTnfuHiBeZKYZ 13 | GNLSo7lGjrp6ccSyfXmlA36hSfdlrWtZJ4+utZShftECgYEA+NNOFNa1BLfDw3ot 14 | a6L4W6FE45B32bLbnBdg8foyEYrwzHLPFCbws1Z60pNr7NaCHDIMiKVOXvKQa78d 15 | qdWuPUVJ83uVs9GI8tAo00RAvBn6ut9yaaLa8mIv6ZpfU20IgE5sDjB7IBY9tTVd 16 | EDyJcDuKQXzQ48qmEw86wINQMd0CgYEA1ZMdt7yLnpDiYa6M/BuKjp7PWKcRlzVM 17 | BcCEYHA4LJ6xEOH4y9DEx2y5ljwOcXgJhXAfAyGQr7s1xiP/nXurqfmdP8u7bawp 18 | VwuWJ8Vv0ZXITaU0isezG2Dpnseuion3qSraWlmWUlWLVVgKETZmk7cF7VIXa0NT 19 | LFREdObI5HsCgYBUbm8KRyi5Zxm4VNbgtTYM8ZYMmdLxPe2i85PjyAABT+IRncuC 20 | jQwT7n5Swc9XWBpiMuFp5J3JPgmfZgRMwsMS61YClqbfk3Qi4FtaBMjqiu43Rubt 21 | zWL56DNV0xoRlufRkcq8rdq5spJR0L+5aLFCMhHh0taW1QaxZPOMq4IkyQKBgQC3 22 | GetubGzewqPyzuz77ri5URm+jW0dT4ofnE9hRpRCXMK9EJ52TkOGHYZ2cIKJcTno 23 | dpl/27Tpk/ykJJSu9SnVDbVszkOf4OuIPty6uCAHdPxG5Q3ItTCulkVz5QmUqHf1 24 | RlHxB8FCUSilQFdRLmx+03h3X9vID+4soQoXlwxAJQKBgE5SQpN+TG5V+E4zHgNd 25 | 6cy6gA5dGDJ0KbsgxJwlKTFA9nIcs2ssBxLY9U4x75EGuqpeVNmq6xwwmPtBs0rp 26 | M3W4zdFrZQ3BneFRW7WbSBbsUSprkJW/p4GXa17GzGUq/MDXlGhNlApP1nknzFvE 27 | xGaH0/H/TZxpLCogVP9npUkj 28 | -----END PRIVATE KEY----- 29 | -------------------------------------------------------------------------------- /tools/tubular_mock.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Cloudflare, Inc. 
2 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | # https://opensource.org/licenses/Apache-2.0 4 | 5 | import socket 6 | import struct 7 | import stat 8 | import os 9 | import signal 10 | 11 | SO_COOKIE = 57 12 | 13 | signal.signal(signal.SIGPIPE, signal.SIG_DFL) 14 | 15 | s = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) 16 | 17 | path = "/tmp/a" 18 | if path and path[0] != '\x00': 19 | try: 20 | if stat.S_ISSOCK(os.stat(path).st_mode): 21 | os.remove(path) 22 | except FileNotFoundError: 23 | pass 24 | 25 | s.bind(path) 26 | s.listen(10) 27 | while True: 28 | c, _ = s.accept() 29 | c.setsockopt(socket.SOL_SOCKET, socket.SO_RCVTIMEO, 30 | struct.pack("ll", 1, 0)) 31 | c.setsockopt(socket.SOL_SOCKET, socket.SO_SNDTIMEO, 32 | struct.pack("ll", 1, 0)) 33 | 34 | while True: 35 | try: 36 | p, ctrl, _flags, _addr = c.recvmsg(1024, 4096) 37 | print("in=%r" % (p, )) 38 | except BlockingIOError: 39 | pass 40 | 41 | fds = [] 42 | for cmsg_level, cmsg_type, cmsg_data in ctrl: 43 | if cmsg_level == socket.SOL_SOCKET: 44 | if cmsg_type == socket.SCM_RIGHTS: 45 | # Parse SCM_RIGHTS message 46 | fds = struct.unpack('i' * (len(cmsg_data) // 4), cmsg_data) 47 | else: 48 | print(f"Unknown cmsg_type {cmsg_type}") 49 | else: 50 | print(f"Unknown cmsg_level {cmsg_level}") 51 | 52 | cookies = [] 53 | for fd in fds: 54 | sd = socket.fromfd(fd, 0, 0, 0) 55 | so_cookie, = struct.unpack( 56 | 'Q', sd.getsockopt(socket.SOL_SOCKET, SO_COOKIE, 8)) 57 | sd.close() 58 | cookies.append("0x%x" % so_cookie) 59 | print("Received sockets with cookies: %s" % (', '.join(cookies))) 60 | if not p or p.rstrip()[-1] == ord(b'#'): 61 | break 62 | try: 63 | c.send(b"OK") 64 | except BrokenPipeError: 65 | pass 66 | c.close() 67 | -------------------------------------------------------------------------------- /include/udpgrm.h: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-only 2 | /* 
Copyright (c) 2025 Cloudflare, Inc. 3 | * Licensed under either 4 | * - the Apache 2.0 license found in the LICENSE file, or 5 | * - the GNU General Public License Version 2 found in the ebpf/LICENSE file 6 | * at your option. The licenses are also available online at, respectively: 7 | * https://opensource.org/license/apache-2-0 8 | * https://opensource.org/license/gpl-2-0 9 | */ 10 | 11 | /* Public API for udpgrm */ 12 | 13 | #ifndef UDP_GRM_PUBLIC_H 14 | #define UDP_GRM_PUBLIC_H 15 | 16 | #include 17 | #include 18 | 19 | enum udp_grm_socket_opt { 20 | UDP_GRM_WORKING_GEN = 200, 21 | UDP_GRM_SOCKET_GEN = 201, 22 | UDP_GRM_DISSECTOR = 202, 23 | UDP_GRM_FLOW_ASSURE = 203, 24 | UDP_GRM_SOCKET_APP = 204 25 | }; 26 | 27 | enum udp_grm_dissector_type { 28 | DISSECTOR_FLOW = 0, 29 | DISSECTOR_CBPF = 1, 30 | DISSECTOR_BESPOKE = 3, 31 | DISSECTOR_NOOP = 4, 32 | }; 33 | 34 | enum udp_grm_dissector_flags { 35 | DISSECTOR_FLAG_VERBOSE = 0x8000, 36 | }; 37 | 38 | #define DISSECTOR_FLAGS (DISSECTOR_FLAG_VERBOSE) 39 | #define DISSECTOR_TYPE(x) ((x) & ~DISSECTOR_FLAGS) 40 | 41 | #define MAX_INSTR 64 42 | #define LABEL_SZ 100 43 | 44 | #define MAX_BESPOKE_SNI 8 45 | #define BESPOKE_SNI_LEN 62 46 | 47 | struct udp_grm_dissector { 48 | uint32_t dissector_type; 49 | /* Keep LRU flow entry for how long after last tx. 
*/ 50 | uint32_t flow_entry_timeout_sec; 51 | 52 | uint32_t max_apps; 53 | uint32_t bespoke_digest; 54 | 55 | /* Tubular label */ 56 | char label[LABEL_SZ]; 57 | union { 58 | struct { 59 | uint32_t filter_len; 60 | struct sock_filter filter[MAX_INSTR]; // 8 bytes * 64 == 512 bytes 61 | }; 62 | struct { 63 | uint32_t bespoke_hostname_len; 64 | struct { 65 | uint8_t app; 66 | uint8_t _res; 67 | uint8_t hostname[BESPOKE_SNI_LEN]; 68 | } bespoke_sni[MAX_BESPOKE_SNI]; // 8 strings of 62 bytes 69 | }; 70 | }; 71 | } __attribute__((packed)); 72 | 73 | struct udp_grm_socket_gen { 74 | uint32_t socket_gen; 75 | uint32_t socket_idx; 76 | uint16_t grm_cookie; // Not to be confused with 64bit socket cookie 77 | uint16_t _reserved; 78 | }; 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /crates/udpgrm/examples/simple-flip.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | use socket2::{Domain, Protocol, SockAddr, Socket, Type}; 6 | use std::net::{IpAddr, Ipv4Addr, SocketAddr}; 7 | use udpgrm::types::{UdpGrmDissectorOpts, UdpGrmDissectorType}; 8 | use udpgrm::UdpGrmSupport; 9 | 10 | fn main() { 11 | let socket = Socket::new(Domain::IPV4, Type::DGRAM, Some(Protocol::UDP)).unwrap(); 12 | 13 | socket.set_reuse_port(true).unwrap(); 14 | 15 | socket 16 | .bind(&SockAddr::from(SocketAddr::new( 17 | IpAddr::V4(Ipv4Addr::LOCALHOST), 18 | 5222, 19 | ))) 20 | .unwrap(); 21 | 22 | let mut opts = UdpGrmDissectorOpts::default(); 23 | 24 | opts.dissector_type = UdpGrmDissectorType::DissectorFlow; 25 | opts.flow_entry_timeout_sec = 120; 26 | 27 | match socket.set_dissector(opts) { 28 | Ok(_) => { 29 | // get current working generation 30 | let gen = socket.get_working_gen().unwrap(); 31 | 32 | // set socket to next generation 33 | socket.set_socket_gen(gen + 1).unwrap(); 34 | 35 | // Give udpgrm daemon a moment to register the socket 36 | let mut rgen = Default::default(); 37 | for i in 0..8 { 38 | rgen = socket.get_socket_gen().unwrap(); 39 | if rgen.socket_idx != 0xffffffff { 40 | break; 41 | } 42 | std::thread::sleep(std::time::Duration::from_millis(1) * 2_u32.pow(i)); 43 | } 44 | 45 | assert_ne!(rgen.socket_idx, 0xffffffff); 46 | 47 | // bump socket group generation 48 | socket.set_working_gen(gen + 1).unwrap(); 49 | 50 | // verify working generation was bumped 51 | let gen2 = socket.get_working_gen().unwrap(); 52 | 53 | assert_eq!(gen + 1, gen2); 54 | 55 | println!("Yay, it worked!"); 56 | } 57 | Err(e) if e.raw_os_error().unwrap() == libc::ENOPROTOOPT => { 58 | println!("cgroups hooks not loaded"); 59 | } 60 | Err(e) => { 61 | eprintln!("Failed to get working generation: {e}"); 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- 
/crates/udpgrm/src/types.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | pub const UDP_GRM_WORKING_GEN: libc::c_int = 200; 6 | pub const UDP_GRM_SOCKET_GEN: libc::c_int = 201; 7 | pub const UDP_GRM_DISSECTOR: libc::c_int = 202; 8 | pub const UDP_GRM_FLOW_ASSURE: libc::c_int = 203; 9 | pub const UDP_GRM_SOCKET_APP: libc::c_int = 204; 10 | 11 | #[repr(u32)] 12 | #[non_exhaustive] 13 | #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Debug, Default)] 14 | pub enum UdpGrmDissectorType { 15 | #[default] 16 | DissectorFlow = 0, 17 | DissectorCbpf = 1, 18 | DissectorBespoke = 3, 19 | DissectorNoop = 4, 20 | } 21 | 22 | #[repr(u32)] 23 | #[non_exhaustive] 24 | #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Debug)] 25 | pub enum UdpGrmDissectorFlags { 26 | DissectorFlagVerbose = 0x8000, 27 | } 28 | 29 | #[repr(C, packed)] 30 | #[non_exhaustive] 31 | #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Debug)] 32 | pub struct UdpGrmDissectorOpts { 33 | pub dissector_type: UdpGrmDissectorType, 34 | pub flow_entry_timeout_sec: u32, 35 | pub max_apps: u32, 36 | pub _res2: u32, 37 | pub label: [u8; 100], 38 | pub filter_len: u32, 39 | pub sock_filter: [SockFilter; 64], 40 | } 41 | 42 | static_assertions::assert_eq_size!(UdpGrmDissectorOpts, [u8; 116 + 4 + 512]); 43 | 44 | #[repr(C)] 45 | #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Debug, Default)] 46 | pub struct SockFilter { 47 | pub code: u16, 48 | pub jt: u8, 49 | pub jf: u8, 50 | pub k: u32, 51 | } 52 | 53 | static_assertions::assert_eq_size!(SockFilter, [u8; 8]); 54 | 55 | #[repr(C)] 56 | #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Debug, Default)] 57 | pub struct UdpGrmSocketGen { 58 | pub socket_gen: u32, 59 | pub socket_idx: u32, 60 | pub 
grm_cookie: u16, 61 | _reserved: u16, 62 | } 63 | 64 | static_assertions::assert_eq_size!(UdpGrmSocketGen, [u8; 12]); 65 | 66 | impl Default for UdpGrmDissectorOpts { 67 | fn default() -> Self { 68 | Self { 69 | dissector_type: UdpGrmDissectorType::DissectorFlow, 70 | flow_entry_timeout_sec: 0, 71 | max_apps: 0, 72 | _res2: 0, 73 | label: [0; 100], 74 | filter_len: 0, 75 | sock_filter: [Default::default(); 64], 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Cloudflare, Inc. 2 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | # https://opensource.org/licenses/Apache-2.0 4 | 5 | import re 6 | import shlex 7 | import string 8 | 9 | 10 | def test_encode_shell(): 11 | a = shlex.split(encode_shell(["a", "b", "c"])) 12 | assert a == ["a", "b", "c"] 13 | a = shlex.split(encode_shell(["\\a", "!b", "\tc"])) 14 | assert a == ["\\a", "!b", "\tc"] 15 | a = shlex.split(encode_shell(["\\a \tc"])) 16 | assert a == ["\\a \tc"] 17 | a = shlex.split(encode_shell(["\"'"])) 18 | assert a == ["\"'"], repr(a) 19 | a = shlex.split(encode_shell(['"abc'])) 20 | assert a == ['"abc'] 21 | a = shlex.split(encode_shell(["'abc\""])) 22 | assert a == ["'abc\""] 23 | a = shlex.split("--test=\"masala chicken\" --test='chicken masala'") 24 | assert a == ["--test=masala chicken", "--test=chicken masala"] 25 | a = encode_shell(["--test=masala chicken", "--test=chicken masala"]) 26 | assert a == "--test='masala chicken' --test='chicken masala'" 27 | 28 | 29 | # The opposite of shlex.split(). It doesn't matter how the stuff is 30 | # going to be encoded, as long as shlex() and potentially bash will 31 | # parse it the same way. With regard to tabs and special chars we 32 | # kind of lost, as passing them via bash is hard. But we should make 33 | # sure at least quotes and spaces work as intended.
34 | # 35 | # There is a special exception for parsing --param=argument syntax. 36 | # Although technically sound, most likely you don't want to encode it 37 | # like that: ' "--param=the argument" ', you most likely want: 38 | # '--param="the argument"', so there's an exception for it. 39 | 40 | PARAM = re.compile("^--(?P[a-z_-]+)[ =](?P.*)$") 41 | ACCEPTABLE_CHARS = set(string.printable) - \ 42 | set(string.whitespace) - set("'\"\\&#!`()[]{}$|") 43 | 44 | 45 | def encode_shell(params): 46 | r""" 47 | >>> test_encode_shell() 48 | """ 49 | s = [] 50 | for token in params: 51 | m = PARAM.match(token) 52 | if m: 53 | m = m.groupdict() 54 | token = m["rest"] 55 | if not set(token) - ACCEPTABLE_CHARS: 56 | enc_token = token 57 | else: 58 | if "'" not in token: 59 | enc_token = "'" + token + "'" 60 | else: 61 | t = token.replace("`", "\\`").replace('"', '\\"') 62 | enc_token = '"' + t + '"' 63 | if not m: 64 | s.append(enc_token) 65 | else: 66 | s.append("--%s=%s" % (m["opt"], enc_token)) 67 | return " ".join(s) 68 | -------------------------------------------------------------------------------- /tests/test_dissector_noop.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Cloudflare, Inc. 2 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | # https://opensource.org/licenses/Apache-2.0 4 | 5 | import os 6 | import select 7 | import threading 8 | import time 9 | 10 | from .
class DissectorNoop(base.TestCase):
    """Checks for the DISSECTOR_NOOP dissector."""

    def test_noop(self):
        """Swap the receiving socket 64 times under traffic; expect no loss.

        A background thread keeps sending datagrams while the test binds
        a fresh socket per generation, drains the previous one and closes
        it.  The number of datagrams received must equal the number sent.
        """
        daemon = self.udpgrm_run("--daemon --install")
        self.assertTrue(daemon.collect_stderr("Tailing message ring")[0])

        def drain(sock):
            # Count the datagrams that can be read right now without blocking.
            got = 0
            while True:
                try:
                    sock.recvmsg(1024, 0, socket.MSG_DONTWAIT)
                except (TimeoutError, BlockingIOError):
                    return got
                got += 1

        current, port = self.bind()
        current.setsockopt(IPPROTO_UDP, UDP_GRM_DISSECTOR,
                           pack("I", DISSECTOR_NOOP))
        current.setsockopt(IPPROTO_UDP, UDP_GRM_SOCKET_GEN, 0)
        self.sync_socket_gen(current)
        current.setblocking(False)
        current.setsockopt(IPPROTO_UDP, UDP_GRM_WORKING_GEN, 0)

        client = self.connect()

        def sender_loop(var):
            # var[0] is the stop flag, var[1] the number of datagrams sent.
            while not var[0]:
                time.sleep(0.0005)
                client.send(b"hello")
                var[1] += 1

        state = [False, 0]
        sender = threading.Thread(target=sender_loop, args=(state,),
                                  daemon=True)
        sender.start()

        received = 0
        # Make the timeouts to send >1k packets and ~1s roughly
        for gen in range(1, 65):
            successor, _ = self.bind(port=port)
            successor.setsockopt(IPPROTO_UDP, UDP_GRM_SOCKET_GEN, gen)
            self.sync_socket_gen(successor)
            successor.setblocking(False)
            successor.setsockopt(IPPROTO_UDP, UDP_GRM_WORKING_GEN, gen)
            time.sleep(0.01)
            received += drain(current)
            current.close()
            self._del_teardown(current)
            current = successor

        state[0] = True
        sender.join()
        received += drain(current)

        # the packet count must match exactly, not even one packet lost
        self.assertEqual(state[1], received)
        delta, _ = self.metrics_delta({})

        # The metrics are arguably boring
        expected = dict.fromkeys(
            ('rx_dissected_ok_total',
             'rx_flow_new_unseen',
             'rx_new_flow_total',
             'rx_new_flow_working_gen_dispatch_ok',
             'rx_processed_total'),
            received)
        self.assertEqual(delta, expected)
// SPDX-License-Identifier: GPL-2.0-only
// Copyright (c) 2025 Cloudflare, Inc.
// Licensed under the GNU General Public License Version 2 found in the ebpf/LICENSE file or at:
// https://opensource.org/license/gpl-2-0

/* Little-endian load helper: a plain cast when the build is known to be
 * little-endian, le64toh() otherwise. */
#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN
#define _le64toh(x) ((uint64_t)(x))
#else
#define _le64toh(x) le64toh(x)
#endif

/* Rotate the 64-bit value x left by b bits. */
#define ROTATE(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))

/* Half of one mixing round. Expands to several statements and modifies
 * a, b, c and d in place, so it must be used as a statement. */
#define HALF_ROUND(a, b, c, d, s, t) \
	a += b; \
	c += d; \
	b = ROTATE(b, s) ^ a; \
	d = ROTATE(d, t) ^ c; \
	a = ROTATE(a, 32);

/* Two full rounds back to back. Not used by hsiphash() below. */
#define DOUBLE_ROUND(v0, v1, v2, v3) \
	HALF_ROUND(v0, v1, v2, v3, 13, 16); \
	HALF_ROUND(v2, v1, v0, v3, 17, 21); \
	HALF_ROUND(v0, v1, v2, v3, 13, 16); \
	HALF_ROUND(v2, v1, v0, v3, 17, 21);

/* One full round: two half rounds with the state words permuted. */
#define ROUND(v0, v1, v2, v3) \
	HALF_ROUND(v0, v1, v2, v3, 13, 16); \
	HALF_ROUND(v2, v1, v0, v3, 17, 21);

/* Keyed hash of src_sz bytes at src, folded down to 32 bits.
 *
 * key is read as two little-endian 64-bit words. The input is consumed
 * in 8-byte words with one ROUND per word; the trailing 0..7 bytes are
 * packed together with the input length into one last word. Three more
 * ROUNDs follow and the four state words are XOR-ed together.
 *
 * NOTE(review): src and key are accessed through uint64_t/uint32_t
 * pointers, so the code presumably relies on unaligned loads being
 * acceptable on the target - confirm.
 */
static uint32_t hsiphash(const void *src, unsigned long src_sz, const char key[16])
{
	const uint64_t *_key = (uint64_t *)key;
	uint64_t k0 = _le64toh(_key[0]);
	uint64_t k1 = _le64toh(_key[1]);
	/* Input length goes into the top byte of the final word; only the
	 * low 8 bits of src_sz survive the shift. */
	uint64_t b = (uint64_t)src_sz << 56;
	const uint64_t *in = (uint64_t *)src;

	/* Initial state: key words XOR-ed with four fixed constants. */
	uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
	uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
	uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
	uint64_t v3 = k1 ^ 0x7465646279746573ULL;

	/* Absorb every complete 8-byte word. */
	while (src_sz >= 8) {
		uint64_t mi = _le64toh(*in);
		in += 1;
		src_sz -= 8;
		v3 ^= mi;
		ROUND(v0, v1, v2, v3);
		v0 ^= mi;
	}

	/* Gather the remaining 0..7 bytes into t. Cases 7..5 fall through
	 * to case 4, which copies the low four bytes with a single 32-bit
	 * load; cases 3..1 copy byte by byte. */
	uint64_t t = 0;
	uint8_t *pt = (uint8_t *)&t;
	uint8_t *m = (uint8_t *)in;
	switch (src_sz) {
	case 7:
		pt[6] = m[6];
		/* fallthrough */
	case 6:
		pt[5] = m[5];
		/* fallthrough */
	case 5:
		pt[4] = m[4];
		/* fallthrough */
	case 4:
		*((uint32_t *)&pt[0]) = *((uint32_t *)&m[0]);
		break;
	case 3:
		pt[2] = m[2];
		/* fallthrough */
	case 2:
		pt[1] = m[1];
		/* fallthrough */
	case 1:
		pt[0] = m[0];
	}
	b |= _le64toh(t);

	/* Absorb the final word, then finalize with two extra rounds. */
	v3 ^= b;
	ROUND(v0, v1, v2, v3);
	v0 ^= b;
	v2 ^= 0xff;
	ROUND(v0, v1, v2, v3);
	ROUND(v0, v1, v2, v3);
	/* The 64-bit XOR is truncated to 32 bits by the return type. */
	return (v0 ^ v1) ^ (v2 ^ v3);
}
async def perform_get_request(ip, port, sni, path):
    """Issue one HTTP/3 GET and return the response body as bytes.

    ip, port -- address of the server to connect to.
    sni      -- used as the TLS server name and as the :authority header.
    path     -- request path, sent as the :path header.

    Certificate verification is disabled (verify_mode=False).
    """
    # Create QUIC configuration
    configuration = QuicConfiguration(
        alpn_protocols=H3_ALPN,
        is_client=True,
        server_name=sni,
        verify_mode=False,
    )

    # Connect to the server
    async with connect(
        host=ip,
        port=port,
        configuration=configuration,
        create_protocol=Http3Client
    ) as protocol:
        # Http3Client creates its H3 layer lazily on the first QUIC event;
        # make sure it exists before it is used here.
        if protocol._http is None:
            protocol._http = H3Connection(protocol._quic)
        http = protocol._http

        stream_id = protocol._quic.get_next_available_stream_id()
        http.send_headers(
            stream_id=stream_id,
            headers=[
                (b":method", b"GET"),
                (b":scheme", b"https"),
                (b":authority", sni.encode()),
                (b":path", path.encode()),
            ]
        )
        http.send_data(stream_id, b"", end_stream=True)
        # send_headers()/send_data() only queue data on the QUIC connection;
        # transmit() is what actually puts the datagrams on the wire.
        protocol.transmit()

        await protocol._response_complete.wait()
        return protocol.get_body()
def quic_client(port_or_addr, list_of_sni):
    """Run the QUIC test client and return its stripped stdout.

    port_or_addr -- either a port (int, or a string without ':'), which
                    is combined with 127.0.0.1, or a full "host:port"
                    string that is used as is.
    list_of_sni  -- extra positional arguments for the client binary
                    (the caller in this file passes URLs).
    """
    if isinstance(port_or_addr, int) or ':' not in port_or_addr:
        # int() so that a port given as a string ("8443") works with %d.
        addr = '127.0.0.1:%d' % (int(port_or_addr),)
    else:
        addr = port_or_addr
    argv0 = shlex.split(TQCLIENT_BIN)
    cmd = argv0 + ["--target", addr] + list(list_of_sni)
    p = base.Process(cmd)
    try:
        r = p.collect_stdout()
    finally:
        # Always close the child, even if collecting its output failed.
        p.close()
    return '\n'.join(r).strip()
/* Raw pidfd_open(2) system call: returns a file descriptor referring to
 * process `pid`, or -1 with errno set. */
int pidfd_open(pid_t pid, unsigned int flags)
{
	return syscall(__NR_pidfd_open, pid, flags);
}

/* Raw pidfd_getfd(2) system call: duplicates descriptor `fd` of the
 * process referred to by `pidfd` into the calling process. The first
 * argument is a pidfd (as returned by pidfd_open()), not a pid.
 * Returns the new descriptor, or -1 with errno set. */
static int pidfd_getfd(int pidfd, int fd, unsigned int flags)
{
	return syscall(__NR_pidfd_getfd, pidfd, fd, flags);
}
*/ 33 | int pidfd_find_socket(int pidfd, int max_continous_gap, int type, int protocol, 34 | struct sockaddr_storage *addr, uint64_t cookie) 35 | { 36 | int i; 37 | int gap = 0; 38 | for (i = 0; i < 16 * 1024 && gap < max_continous_gap; i++) { 39 | int f = pidfd_getfd(pidfd, i, 0); 40 | if (f < 0) { 41 | gap += 1; 42 | continue; 43 | } 44 | gap = 0; 45 | 46 | int r; 47 | int v; 48 | socklen_t v_sz = sizeof(v); 49 | r = getsockopt(f, SOL_SOCKET, SO_DOMAIN, &v, &v_sz); 50 | if (r != 0 || v != addr->ss_family) 51 | goto next; 52 | r = getsockopt(f, SOL_SOCKET, SO_TYPE, &v, &v_sz); 53 | if (r != 0 || v != type) 54 | goto next; 55 | r = getsockopt(f, SOL_SOCKET, SO_PROTOCOL, &v, &v_sz); 56 | if (r != 0 || v != protocol) 57 | goto next; 58 | 59 | struct sockaddr_storage ss; 60 | socklen_t ss_len = sizeof(ss); 61 | getsockname(f, (struct sockaddr *)&ss, &ss_len); 62 | 63 | if (ss.ss_family != addr->ss_family) 64 | goto next; 65 | 66 | switch (ss.ss_family) { 67 | case AF_INET: { 68 | struct sockaddr_in *sina = (struct sockaddr_in *)&ss; 69 | struct sockaddr_in *sinb = (struct sockaddr_in *)addr; 70 | if (sina->sin_port != sinb->sin_port) 71 | goto next; 72 | if (sina->sin_addr.s_addr != sinb->sin_addr.s_addr) 73 | goto next; 74 | break; 75 | } 76 | case AF_INET6: { 77 | struct sockaddr_in6 *sin6a = (struct sockaddr_in6 *)&ss; 78 | struct sockaddr_in6 *sin6b = (struct sockaddr_in6 *)addr; 79 | if (sin6a->sin6_port != sin6b->sin6_port) 80 | goto next; 81 | if (memcmp(&sin6a->sin6_addr, &sin6b->sin6_addr, 16) != 0) 82 | goto next; 83 | break; 84 | } 85 | default: 86 | goto next; 87 | } 88 | 89 | uint64_t c; 90 | v_sz = sizeof(c); 91 | r = getsockopt(f, SOL_SOCKET, SO_COOKIE, &c, &v_sz); 92 | if (r != 0 || c != cookie) 93 | goto next; 94 | return (f); 95 | 96 | next: 97 | close(f); 98 | continue; 99 | } 100 | return -1; 101 | } 102 | -------------------------------------------------------------------------------- /src/utils.c: 
/* Build a getopt() short-option string from a getopt_long() table.
 *
 * Every entry with a NULL `flag` and a `val` in 1..255 contributes its
 * `val` character, followed by ":" for required_argument or "::" for
 * optional_argument. The table must end with an entry whose name is
 * NULL.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call. Aborts if the result does not fit in the buffer.
 */
const char *optstring_from_long_options(const struct option *opt)
{
	static char optstring[256] = {0};
	char *osp = optstring;

	for (; opt->name != NULL; opt++) {
		if (opt->flag != NULL || opt->val <= 0 || opt->val >= 256)
			continue;

		size_t need = 1;
		if (opt->has_arg == required_argument)
			need = 2;
		else if (opt->has_arg == optional_argument)
			need = 3;

		/* Check before writing, so the buffer is never overrun. */
		if ((size_t)(osp - optstring) + need >= sizeof(optstring) - 1) {
			abort();
		}

		*osp++ = opt->val;
		switch (opt->has_arg) {
		case optional_argument:
			*osp++ = ':';
			*osp++ = ':';
			break;
		case required_argument:
			*osp++ = ':';
			break;
		}
	}
	*osp = '\0';

	return optstring;
}
/* Append formatted text to the NUL-terminated string in `buf`, never
 * writing beyond `size` bytes in total.
 *
 * Returns the length the complete string would have had with unlimited
 * space (same convention as snprintf), so a result >= size means the
 * output was truncated. If formatting itself fails, nothing is appended
 * and the current length of `buf` is returned.
 */
size_t snprintfcat(char *buf, size_t size, char const *fmt, ...)
{
	va_list args;
	size_t len = strnlen(buf, size);
	int n;

	va_start(args, fmt);
	n = vsnprintf(buf + len, size - len, fmt, args);
	va_end(args);

	/* vsnprintf reports errors as a negative int; do not let that wrap
	 * around when converted to size_t. */
	if (n < 0)
		return len;

	return len + (size_t)n;
}
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "common.h" 11 | 12 | #ifndef PACKAGE_VERSION 13 | #define PACKAGE_VERSION "dev" 14 | #endif 15 | 16 | void init_metrics(int prog_fd) 17 | { 18 | int metrics_map_fd = map_from_prog(prog_fd, "metrics_map", NULL); 19 | if (metrics_map_fd < 0) 20 | error(-1, errno, "map_from_prog()"); 21 | 22 | unsigned int metrics_key = 0; 23 | metrics_t metrics_value = {0}; 24 | strncpy(metrics_value.package_version, PACKAGE_VERSION, 25 | sizeof(metrics_value.package_version) - 1); 26 | 27 | int r = bpf_map_update_elem(metrics_map_fd, &metrics_key, &metrics_value, 0); 28 | if (r < 0) 29 | error(-1, r, "bpf_map_update_elem()"); 30 | } 31 | 32 | metrics_t get_metrics(int prog_fd) 33 | { 34 | int metrics_map_fd = map_from_prog(prog_fd, "metrics_map", NULL); 35 | if (metrics_map_fd < 0) 36 | error(-1, errno, "map_from_prog()"); 37 | 38 | unsigned int metrics_key = 0; 39 | metrics_t metrics_value; 40 | 41 | int r = bpf_map_lookup_elem(metrics_map_fd, &metrics_key, &metrics_value); 42 | if (r < 0) 43 | error(-1, r, "bpf_map_lookup_elem()"); 44 | 45 | return metrics_value; 46 | } 47 | 48 | void do_socket_metrics(struct reuseport_storage_key *key, struct reuseport_storage *s) 49 | { 50 | char *t = key_to_str(key); 51 | 52 | uint32_t max_apps = s->dis.max_apps; 53 | if (max_apps == 0) { 54 | max_apps = 1; 55 | } 56 | 57 | for (uint32_t app_idx = 0; app_idx < max_apps; app_idx++) { 58 | printf("working_gen{socket=\"%s\",app_idx=\"%d\"} %d\n", t, app_idx, 59 | s->working_gen[app_idx]); 60 | } 61 | 62 | #define METRIC(token) \ 63 | if (s->token > 0) { \ 64 | printf(#token "{socket=\"%s\"} %lu\n", t, s->token); \ 65 | } 66 | 67 | METRIC(socket_critical_gauge); 68 | METRIC(socket_critical); 69 | 70 | METRIC(rx_processed_total); 71 | METRIC(rx_internal_state_error); 72 | METRIC(rx_cbpf_prog_error); 
73 | METRIC(rx_packet_too_short_error); 74 | 75 | METRIC(rx_dissected_ok_total); 76 | METRIC(rx_flow_ok); 77 | METRIC(rx_flow_rg_conflict); 78 | METRIC(rx_flow_other_error); 79 | METRIC(rx_flow_new_unseen); 80 | METRIC(rx_flow_new_had_expired); 81 | METRIC(rx_flow_new_bad_cookie); 82 | 83 | METRIC(rx_new_flow_total); 84 | METRIC(rx_new_flow_working_gen_dispatch_ok); 85 | METRIC(rx_new_flow_working_gen_dispatch_error); 86 | 87 | METRIC(tx_total); 88 | METRIC(tx_flow_create_ok); 89 | METRIC(tx_flow_create_from_expired_ok); 90 | METRIC(tx_flow_create_error); 91 | METRIC(tx_flow_update_ok); 92 | METRIC(tx_flow_update_conflict); 93 | } 94 | 95 | void do_metrics(int prog_fd, int map_fd) 96 | { 97 | int metrics_map_fd = map_from_prog(prog_fd, "metrics_map", NULL); 98 | if (metrics_map_fd < 0) 99 | error(-1, errno, "map_from_prog()"); 100 | 101 | metrics_t metrics = get_metrics(prog_fd); 102 | 103 | printf("build_info{version=\"%s\"} 1\n", metrics.package_version); 104 | 105 | struct reuseport_storage_key key = {}; 106 | int err = 0; 107 | bpf_map_get_next_key(map_fd, NULL, &key); 108 | while (!err) { 109 | struct reuseport_storage s = {}; 110 | int r = bpf_map_lookup_elem(map_fd, &key, &s); 111 | if (r == 0) { 112 | do_socket_metrics(&key, &s); 113 | } 114 | 115 | err = bpf_map_get_next_key(map_fd, &key, &key); 116 | } 117 | } -------------------------------------------------------------------------------- /crates/udpgrm/examples/tqclient.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | use clap::Parser; 6 | use std::error::Error; 7 | use tokio_quiche::http3::driver::{ClientH3Event, H3Event, InboundFrame, IncomingH3Headers}; 8 | use tokio_quiche::quiche::h3; 9 | 10 | #[derive(Parser, Debug)] 11 | #[command(author, version, about, long_about = None)] 12 | struct Args { 13 | #[arg(short, long)] 14 | verbose: bool, 15 | 16 | #[arg(short, long, value_name = "IP:PORT", value_parser = clap::value_parser!(std::net::SocketAddr))] 17 | target: std::net::SocketAddr, 18 | 19 | #[arg(value_name = "URL")] 20 | url: Vec, 21 | } 22 | 23 | #[tokio::main(flavor = "current_thread")] 24 | async fn main() -> Result<(), Box> { 25 | let args = Args::parse(); 26 | 27 | for url in args.url { 28 | let host = url.host_str().unwrap_or("").to_string(); 29 | let path = url.path(); 30 | let query = url.query().map(|q| format!("?{}", q)).unwrap_or_default(); 31 | let full_path = format!("{}{}", path, query); 32 | 33 | let socket = tokio::net::UdpSocket::bind("0.0.0.0:0").await?; 34 | socket.connect(args.target).await.unwrap(); 35 | 36 | println!("target {} host {}\n", args.target, host); 37 | let (_, mut controller) = tokio_quiche::quic::connect(socket, Some(&host)) 38 | .await 39 | .unwrap(); 40 | println!("post\n"); 41 | 42 | controller 43 | .request_sender() 44 | .send(tokio_quiche::http3::driver::NewClientRequest { 45 | request_id: 1, 46 | headers: vec![ 47 | h3::Header::new(b":method", b"GET"), 48 | h3::Header::new(b":scheme", url.scheme().as_bytes()), 49 | h3::Header::new(b":authority", host.as_bytes()), 50 | h3::Header::new(b":path", full_path.as_bytes()), 51 | ], 52 | body_writer: None, 53 | }) 54 | .unwrap(); 55 | 56 | while let Some(event) = controller.event_receiver_mut().recv().await { 57 | match event { 58 | ClientH3Event::Core(H3Event::IncomingHeaders(IncomingH3Headers { 59 | stream_id: _, 60 | headers, 61 | mut recv, 62 | .. 
63 | })) => { 64 | if args.verbose { 65 | println!("{:?}", headers); 66 | } 67 | 'body: while let Some(frame) = recv.recv().await { 68 | match frame { 69 | InboundFrame::Body(pooled, fin) => { 70 | match std::str::from_utf8(&pooled) { 71 | Ok("") => (), 72 | Ok(body) => print!("{}", body), 73 | Err(_) => (), 74 | } 75 | if fin { 76 | break 'body; 77 | } 78 | } 79 | InboundFrame::Datagram(_pooled) => {} 80 | } 81 | } 82 | } 83 | ClientH3Event::Core(H3Event::BodyBytesReceived { fin: true, .. }) => { 84 | break; 85 | } 86 | ClientH3Event::Core(_event) => (), 87 | ClientH3Event::NewOutboundRequest { .. } => (), 88 | } 89 | } 90 | } 91 | Ok(()) 92 | } 93 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #include "../include/udpgrm_internal.h" 14 | 15 | #ifndef IP_PKTINFO 16 | #define IP_PKTINFO 8 17 | #endif 18 | 19 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 20 | 21 | /* utils.c */ 22 | const char *optstring_from_long_options(const struct option *opt); 23 | void setup_itimer(int seconds, int *tick); 24 | void setup_ctrlc(int *done); 25 | int signal_desc(int *sig, int sig_num); 26 | 27 | uint64_t suffix(uint64_t n, char **suffix); 28 | int poke_cmsg(struct msghdr *msgh, int single_ctrl_sz); 29 | int set_timstamp_cmsg(struct msghdr *msgh, int single_ctrl_sz); 30 | int print_cmsg(struct msghdr *msgh); 31 | int get_tx_timespec(struct msghdr *msgh, struct timespec *tx, int *type, int *pkt_idx); 32 | int get_rx_timespec(struct msghdr *msgh, struct timespec *tx); 33 | 34 | int gro_to_packets(int gro_sz, int bytes); 35 | int fprintf_hex(FILE *out, char *desc, void *addr, int len); 36 | 
int mirror_packet(uint8_t *data, int data_len, uint16_t port); 37 | 38 | size_t snprintfcat(char *buf, size_t size, char const *fmt, ...); 39 | void bump_memlock_rlimit(void); 40 | 41 | /* net.c */ 42 | socklen_t net_ss_size(struct sockaddr_storage *ss); 43 | int net_get_port(struct sockaddr_storage *ss); 44 | const char *net_ss_ntop(struct sockaddr_storage *ss, int show_port); 45 | int net_parse_sockaddr(struct sockaddr_storage *ss, const char *addr, int default_port); 46 | 47 | int net_gethostbyname(struct sockaddr_storage *ss, const char *host, int port, 48 | int force_family); 49 | 50 | /* pidfd.c */ 51 | int pidfd_open(pid_t pid, unsigned int flags); 52 | int pidfd_find_socket(int pidfd, int max_continous_gap, int type, int protocol, 53 | struct sockaddr_storage *addr, uint64_t cookie); 54 | 55 | /* uspace.c */ 56 | struct reuseport_storage; 57 | void cookies_find_empty(struct reuseport_storage *state, int gen, int sockhash_fd, 58 | uint64_t cookie, int *prev_pos, int *free_pos, int *gen_len); 59 | 60 | struct msg_value; 61 | void run_cb_update_map(struct msg_value *msg); 62 | struct reuseport_storage_key; 63 | void metric_incr_critical(const struct reuseport_storage_key *skey, int counter, 64 | int gauge); 65 | 66 | struct bpf_map_info; 67 | int map_from_prog(int prog_fd, char *map_name, struct bpf_map_info *user_map_info); 68 | struct sockaddr_storage; 69 | void skey_from_ss(struct reuseport_storage_key *skey, struct sockaddr_storage *ss); 70 | 71 | int *map_by_name(char *map_name, uint32_t skip_id); 72 | uint32_t map_fd_to_id(int map_fd); 73 | 74 | /* cgroup.c */ 75 | struct bpf_prog_info; 76 | int prog_from_cgroup(int cg_fd, int prog_type, char *prog_info_name, 77 | struct bpf_prog_info *user_info); 78 | int cgroup_from_paths(char **cgroup_paths, char **selected_cgroup_path, int cgroup_self); 79 | void cleanup_bpf_pin_dir(char *bpf_pin_dir); 80 | 81 | /* tubular.c */ 82 | int tubular_maybe_preserve_fd(struct reuseport_storage *state, int gen, int gen_len, 
83 | int free_pos, int f); 84 | int tubular_maybe_register(struct reuseport_storage *state, int wg, char *tubular_path); 85 | void reuseport_groups_maybe_cleanup_stale(); 86 | int reuseport_groups_empty(); 87 | 88 | /* do_list.c */ 89 | char *key_to_str(struct reuseport_storage_key *key); 90 | void do_list(int prog_fd, int map_fd, struct sockaddr_storage *reuseport_ss, int verbose); 91 | 92 | /* do_flows.c */ 93 | void do_flows(int prog_fd, int map_fd, struct sockaddr_storage *reuseport_ss, 94 | int verbose); 95 | 96 | /* metrics.c */ 97 | void init_metrics(int prog_fd); 98 | metrics_t get_metrics(int prog_fd); 99 | void do_metrics(int prog_fd, int map_fd); 100 | -------------------------------------------------------------------------------- /src/do_flows.c: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "list.h" 14 | 15 | #include "common.h" 16 | 17 | #define TIMESPEC_NSEC(ts) ((ts)->tv_sec * 1000000000ULL + (ts)->tv_nsec) 18 | 19 | struct item { 20 | struct hlist_node in_list; 21 | struct lru_key key; 22 | struct lru_value value; 23 | }; 24 | 25 | #define FLOWS_MAX 256 26 | 27 | static void _do_flows(int prog_fd, struct reuseport_storage_key *_key, 28 | struct reuseport_storage *s, int verbose, struct hlist_head *flows) 29 | { 30 | uint32_t flow_entry_timeout_sec = s->dis.flow_entry_timeout_sec; 31 | if (flow_entry_timeout_sec == 0) 32 | flow_entry_timeout_sec = FLOW_DEFAULT_TIMEOUT_SEC; 33 | 34 | char *t = key_to_str(_key); 35 | printf("%s\n", t); 36 | int sockhash_fd = map_from_prog(prog_fd, "sockhash", NULL); 37 | if (sockhash_fd < 0) 38 | error(-1, errno, "map_from_prog()"); 39 | 40 | struct timespec ts; 41 | clock_gettime(CLOCK_MONOTONIC, 
&ts); 42 | uint64_t now = TIMESPEC_NSEC(&ts); 43 | 44 | int gen; 45 | uint32_t j; 46 | for (gen = 0; gen < MAX_GENS; gen++) { 47 | for (j = 0; j < s->max_idx[gen]; j++) { 48 | uint64_t c = s->cookies[gen][j]; 49 | if (c == 0) 50 | break; 51 | 52 | uint64_t v = 0; 53 | int r = bpf_map_lookup_elem(sockhash_fd, &c, &v); 54 | if (r == -1 || v == 0) { 55 | // nonexistent socket. 56 | continue; 57 | } 58 | 59 | int first = 0; 60 | struct hlist_node *pos; 61 | hlist_for_each(pos, &flows[c % FLOWS_MAX]) 62 | { 63 | struct item *item = 64 | hlist_entry(pos, struct item, in_list); 65 | if (item->value.cookie == c) { 66 | uint64_t age_ns = (now - item->value.last_tx_ns); 67 | double age_s = age_ns / 1000000000.; 68 | int stale = age_s > flow_entry_timeout_sec; 69 | 70 | if (stale == 0 || verbose > 0) { 71 | if (first == 0) { 72 | printf("\tso_cookie 0x%lx\n", c); 73 | first = 1; 74 | } 75 | 76 | printf("\t\t%08x age %.1fs %s", 77 | item->key.rx_hash, age_s, 78 | stale ? "(stale)" : ""); 79 | printf("\n"); 80 | } 81 | } 82 | } 83 | } 84 | } 85 | } 86 | 87 | void do_flows(int prog_fd, int map_fd, struct sockaddr_storage *reuseport_ss, int verbose) 88 | { 89 | /* Indexed by socket cookie */ 90 | struct hlist_head flows[FLOWS_MAX] = {[0 ...(FLOWS_MAX - 1)] = HLIST_HEAD_INIT}; 91 | 92 | { 93 | int lru_fd = map_from_prog(prog_fd, "lru_map", NULL); 94 | if (lru_fd < 0) 95 | error(-1, errno, "map_from_prog()"); 96 | 97 | struct lru_key key = {}; 98 | int err = bpf_map_get_next_key(lru_fd, NULL, &key); 99 | while (!err) { 100 | struct lru_value value = {}; 101 | int r = bpf_map_lookup_elem(lru_fd, &key, &value); 102 | if (r == 0) { 103 | /* Impossible to know if it's stale at this point */ 104 | struct item *i = 105 | (struct item *)calloc(1, sizeof(struct item)); 106 | i->key = key; 107 | i->value = value; 108 | hlist_add_head(&i->in_list, 109 | &flows[value.cookie % FLOWS_MAX]); 110 | } 111 | err = bpf_map_get_next_key(lru_fd, &key, &key); 112 | } 113 | } 114 | 115 | struct 
reuseport_storage_key key = {}; 116 | int err = 0; 117 | if (reuseport_ss->ss_family != AF_UNSPEC) { 118 | skey_from_ss(&key, reuseport_ss); 119 | } else { 120 | bpf_map_get_next_key(map_fd, NULL, &key); 121 | } 122 | while (!err) { 123 | struct reuseport_storage s = {}; 124 | int r = bpf_map_lookup_elem(map_fd, &key, &s); 125 | if (r == 0) { 126 | _do_flows(prog_fd, &key, &s, verbose, flows); 127 | } 128 | 129 | if (reuseport_ss->ss_family != AF_UNSPEC) { 130 | // finish loop 131 | break; 132 | } 133 | 134 | err = bpf_map_get_next_key(map_fd, &key, &key); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /tools/mmdecoy.c: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | #define _GNU_SOURCE // clone 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | static sigset_t default_sigmask; 19 | /* Set in clone_child only, contains child's own pid */ 20 | static pid_t child_tid; 21 | 22 | #define PID_ENV "LISTEN_PID" 23 | 24 | struct execve_params { 25 | char *pathname; 26 | char **argv; 27 | char **envp; 28 | }; 29 | 30 | static int clone_child(void *arg) 31 | { 32 | /* copy to local stack */ 33 | struct execve_params e = *(struct execve_params *)arg; 34 | char listen_pid_var[32]; 35 | 36 | sigprocmask(SIG_SETMASK, &default_sigmask, NULL); 37 | 38 | /* if systemd provides notify socket, fix LISTEN_PID for child */ 39 | for (char **env = e.envp; *env != NULL; env++) { 40 | if (strncmp(PID_ENV, *env, strlen(PID_ENV)) == 0) { 41 | snprintf(&listen_pid_var[0], sizeof(listen_pid_var), 42 | PID_ENV "=%" PRIu32, child_tid); 43 | *env = &listen_pid_var[0]; 44 | break; 45 | } 46 | } 47 | 48 | execve(e.pathname, 
e.argv, e.envp); 49 | /* execve() should never return */ 50 | error(-1, errno, "execve(%s)", e.pathname); 51 | abort(); 52 | } 53 | 54 | static pid_t fork_execve_pidfd(int *child_pidfd, char **argv, char **envp) 55 | { 56 | char stack[1024]; 57 | char *stack_top = stack + sizeof(stack); 58 | 59 | struct execve_params e = {.pathname = argv[0], .argv = argv, .envp = envp}; 60 | 61 | int pidfd = -1; 62 | int pid = clone(clone_child, stack_top - ((long)stack_top & 0xf), 63 | CLONE_PIDFD | CLONE_CHILD_SETTID | SIGCHLD, &e, &pidfd, NULL, 64 | &child_tid); 65 | if (pid <= 0) { 66 | return pid; 67 | } 68 | 69 | if (child_pidfd) { 70 | *child_pidfd = pidfd; 71 | } else { 72 | close(pidfd); 73 | } 74 | return pid; 75 | } 76 | 77 | static int pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) 78 | { 79 | return syscall(SYS_pidfd_send_signal, pidfd, sig, info, flags); 80 | } 81 | 82 | int main(int argc, char **argv, char **envp) 83 | { 84 | (void)argc; 85 | sigset_t mask; 86 | 87 | /* All signals */ 88 | sigfillset(&mask); 89 | 90 | int sfd = signalfd(-1, &mask, SFD_CLOEXEC); 91 | if (sfd == -1) { 92 | error(-1, errno, "signalfd()"); 93 | } 94 | 95 | /* Block signals so that they aren't handled 96 | according to their default dispositions */ 97 | if (sigprocmask(SIG_BLOCK, &mask, &default_sigmask) == -1) { 98 | error(-1, errno, "sigprocmask()"); 99 | } 100 | 101 | char **child_argv = &argv[1]; 102 | 103 | if (*child_argv == NULL) { 104 | error(-1, 0, "Usage: %s -- COMMAND [ARGS]", argv[0]); 105 | } 106 | 107 | if (strcmp(child_argv[0], "--") == 0) { 108 | child_argv = &argv[2]; 109 | } 110 | 111 | int child_pidfd = -1; 112 | pid_t child_pid = fork_execve_pidfd(&child_pidfd, child_argv, envp); 113 | if (child_pid < 1) { 114 | error(-1, errno, "clone3()"); 115 | } 116 | 117 | while (1) { 118 | struct signalfd_siginfo fdsi; 119 | ssize_t s = read(sfd, &fdsi, sizeof(fdsi)); 120 | if (s != sizeof(fdsi)) 121 | error(-1, EMSGSIZE, "read(signalfd)"); 122 | 123 | 
if (fdsi.ssi_pid == (uint32_t)child_pid && fdsi.ssi_signo == SIGCHLD) { 124 | /* Child had died. Copy its status. */ 125 | exit(fdsi.ssi_status); 126 | } 127 | 128 | if (fdsi.ssi_pid == (uint32_t)child_pid && fdsi.ssi_signo == SIGURG) { 129 | /* SIGURG from child means we should exit, 130 | it entered graceful shutdown mode */ 131 | exit(0); 132 | } 133 | 134 | /* Signals from parent always forward */ 135 | int r = pidfd_send_signal(child_pidfd, fdsi.ssi_signo, NULL, 0); 136 | if (r != 0) { 137 | /* If signal send fails, just ignore and wait for SIGCHLD. */ 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/cgroup.c: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | int prog_from_cgroup(int cg_fd, int prog_type, char *prog_info_name, 19 | struct bpf_prog_info *user_info) 20 | { 21 | uint32_t prog_ids[128]; 22 | uint32_t prog_ids_sz = 128; 23 | int r = bpf_prog_query(cg_fd, prog_type, 0, NULL, prog_ids, &prog_ids_sz); 24 | if (r != 0) 25 | error(-1, errno, "bpf_prog_query(cgroup)"); 26 | 27 | int i; 28 | for (i = 0; i < (int)prog_ids_sz; i++) { 29 | struct bpf_prog_info info = {}; 30 | if (user_info) // needed for prog.map_ids 31 | info = *user_info; 32 | uint32_t info_len = sizeof(info); 33 | int prog_fd = bpf_prog_get_fd_by_id(prog_ids[i]); 34 | if (prog_fd < 0) 35 | error(-1, errno, "bpf_prog_get_fd_by_id"); 36 | r = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); 37 | if (r != 0) 38 | error(-1, errno, "bpf_prog_get_info_by_fd"); 39 | if (strncmp(info.name, prog_info_name, BPF_OBJ_NAME_LEN - 1) == 0) { 40 | if (user_info) 41 
| *user_info = info; 42 | return prog_fd; 43 | } 44 | close(prog_fd); 45 | } 46 | return -1; 47 | } 48 | 49 | static char *get_first_proc_self_cgroup() 50 | { 51 | char line[PATH_MAX * 2] = {}; 52 | char *cgroup_path = NULL; 53 | 54 | FILE *file = fopen("/proc/self/cgroup", "r"); 55 | if (file == NULL) { 56 | error(-1, errno, "fopen"); 57 | } 58 | 59 | while (fgets(line, sizeof(line), file)) { 60 | char *path = strchr(line, '/'); 61 | if (path) { 62 | while (path && strlen(path) > 1 && 63 | path[strlen(path) - 1] == '\n') { 64 | path[strlen(path) - 1] = '\x00'; 65 | } 66 | cgroup_path = strdup(path); 67 | break; 68 | } 69 | } 70 | 71 | fclose(file); 72 | return cgroup_path; 73 | } 74 | 75 | int cgroup_from_paths(char **cgroup_paths, char **selected_cgroup_path, int cgroup_self) 76 | { 77 | char *main_cgroup_path = NULL; 78 | int cg_fd = -1; 79 | int i; 80 | for (i = 0; cgroup_paths[i] != NULL; i++) { 81 | cg_fd = open(cgroup_paths[i], O_DIRECTORY | O_RDONLY); 82 | if (cg_fd >= 0) { 83 | main_cgroup_path = cgroup_paths[i]; 84 | goto maybe_self; 85 | } 86 | } 87 | return -1; 88 | 89 | maybe_self: 90 | if (cgroup_self == 0) { 91 | if (selected_cgroup_path) 92 | *selected_cgroup_path = strdup(main_cgroup_path); 93 | return cg_fd; 94 | } 95 | close(cg_fd); 96 | char path[PATH_MAX] = {}; 97 | char *child_cgroup = get_first_proc_self_cgroup(); 98 | if (child_cgroup == NULL) { 99 | error(-1, ENOENT, "open(/proc/self/cgroup) parsing failed"); 100 | } 101 | snprintf(path, sizeof(path), "%s%s", main_cgroup_path, child_cgroup); 102 | free(child_cgroup); 103 | cg_fd = open(path, O_DIRECTORY | O_RDONLY); 104 | if (cg_fd >= 0) { 105 | if (selected_cgroup_path) 106 | *selected_cgroup_path = strdup(path); 107 | return cg_fd; 108 | } 109 | error(0, ENOENT, "open(%s) failed", path); 110 | return -1; 111 | } 112 | 113 | void cleanup_bpf_pin_dir(char *bpf_pin_dir) 114 | { 115 | /* Generally don't error on directory cleanup errors */ 116 | DIR *const directory = opendir(bpf_pin_dir); 
117 | if (directory) { 118 | struct dirent *entry; 119 | while ((entry = readdir(directory))) { 120 | if (entry->d_name[0] == '.') { 121 | continue; 122 | } 123 | /* We must explicitly detach the link, 124 | * otherwise the detach is done by 125 | * some delayed GC in the kernel and 126 | * takes some time after program exit 127 | * causing races in tests. */ 128 | char b[PATH_MAX]; 129 | snprintf(b, sizeof(b), "%s/%s", bpf_pin_dir, entry->d_name); 130 | int fd = bpf_obj_get(b); 131 | if (fd >= 0) { 132 | bpf_link_detach(fd); 133 | close(fd); 134 | } 135 | 136 | unlinkat(dirfd(directory), entry->d_name, 0); 137 | } 138 | closedir(directory); 139 | } 140 | 141 | rmdir(bpf_pin_dir); 142 | } 143 | -------------------------------------------------------------------------------- /tests/test_dissector_quic.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Cloudflare, Inc. 2 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | # https://opensource.org/licenses/Apache-2.0 4 | 5 | import struct 6 | 7 | from . 
import base 8 | from .lsocket import * 9 | 10 | 11 | class DissectorQuic(base.TestCase): 12 | ''' 13 | ldb [0] 14 | and #0x80 15 | jeq #0x80, long_form, short_form 16 | 17 | long_form: 18 | ldb [5] 19 | jneq #16, bad_length 20 | ldx #6 21 | jmp parse_dcid 22 | 23 | bad_length: 24 | ret #-1 25 | 26 | short_form: 27 | ldx #1 28 | jmp parse_dcid 29 | 30 | parse_dcid: 31 | ldh [x + 0] 32 | ret a 33 | 34 | ''' 35 | test_quic_cbpf = [ 36 | (0x30, 0, 0, 0000000000), 37 | (0x54, 0, 0, 0x00000080), 38 | (0x15, 0, 5, 0x00000080), 39 | (0x30, 0, 0, 0x00000005), 40 | (0x15, 0, 2, 0x00000010), 41 | (0x01, 0, 0, 0x00000006), 42 | (0x05, 0, 0, 0x00000003), 43 | (0x06, 0, 0, 0xffffffff), 44 | (0x01, 0, 0, 0x00000001), 45 | (0x05, 0, 0, 0000000000), 46 | (0x48, 0, 0, 0000000000), 47 | (0x16, 0, 0, 0000000000), 48 | ] 49 | 50 | def test_quic_not_quic_hdr(self): 51 | p = self.udpgrm_run("--daemon --install") 52 | self.assertTrue(p.collect_stderr("Tailing")[0]) 53 | sa, port = self.bind() 54 | 55 | cbpf = self.test_quic_cbpf 56 | v = struct.pack("IIII100sI256s", DISSECTOR_CBPF, 57 | 124, 0, 0, b'', 58 | len(cbpf), b''.join(struct.pack('HBBI', *sf) for sf in cbpf)) 59 | sa.setsockopt(IPPROTO_UDP, UDP_GRM_DISSECTOR, v) 60 | 61 | sa.setsockopt(IPPROTO_UDP, UDP_GRM_SOCKET_GEN, 0) 62 | self.sync_socket_gen(sa) 63 | self.assertTrue(p.collect_stdout("socket found")[0]) 64 | 65 | v = sa.getsockopt(IPPROTO_UDP, UDP_GRM_SOCKET_GEN, 12) 66 | self.assertEqual(v, struct.pack('III', 0, 0, 0x80)) 67 | quic_cookie = struct.unpack('IIHH', v)[2] 68 | 69 | # Too short packet for Quic -> _error 70 | old = self.socket() 71 | old.connect(('127.0.0.1', port)) 72 | old.send(b'\xffhell') 73 | self.assertEqual(sa.echo(), b'\xffhell') 74 | 75 | D, M = self.metrics_delta({}) 76 | # TEST: D == {} 77 | self.assertEqual(D, {'rx_processed_total': 1, 78 | 'rx_packet_too_short_error': 1}) 79 | 80 | NEW_FLOW_COUNTERS = {'rx_processed_total': 1, 81 | 'rx_new_flow_total': 1, 82 | 'rx_new_flow_working_gen_dispatch_ok': 1, 
83 | 'rx_dissected_ok_total': 1, 84 | 'rx_flow_new_unseen': 1} 85 | 86 | VALID_RX_FLOW_COUNTERS = {'rx_processed_total': 1, 87 | 'rx_dissected_ok_total': 1, 88 | 'rx_flow_ok': 1} 89 | 90 | # Short QUIC packet, wrong quic_cookie data -> new flow 91 | old.send(b'\x01'*128) 92 | self.assertEqual(sa.echo(), b'\x01'*128) 93 | D, M = self.metrics_delta(M) 94 | self.assertEqual(D, NEW_FLOW_COUNTERS | {'rx_flow_new_bad_cookie': 1}) 95 | 96 | # Short QUIC packet, correct quic_cookie data -> just RX 97 | v = struct.pack(">BHHIII", 0x00, quic_cookie, 0, 0, 0, 0) 98 | old.send(v) 99 | self.assertEqual(sa.echo(), v) 100 | D, M = self.metrics_delta(M) 101 | self.assertEqual(D, VALID_RX_FLOW_COUNTERS) 102 | 103 | # Long packet -> new flow 104 | old.send(b'\xff'*128) 105 | self.assertEqual(sa.echo(), b'\xff'*128) 106 | D, M = self.metrics_delta(M) 107 | self.assertEqual(D, NEW_FLOW_COUNTERS) 108 | 109 | # Valid long packet -> just as RX 110 | # type, version, dcid_len, dcid 111 | v = struct.pack(">BIBHHIII", 0x80, 1, 16, 112 | quic_cookie, 0, 0, 0, 0) 113 | old.send(v) 114 | self.assertTrue(sa.recv(99)) 115 | D, M = self.metrics_delta(M) 116 | self.assertEqual(D, VALID_RX_FLOW_COUNTERS) 117 | 118 | # invalid long packet -> new flow 119 | v = struct.pack(">BIBIIII", 0x80, 1, 16, 0x000f0, 0, 0, 0) 120 | old.send(v) 121 | self.assertTrue(sa.recv(99)) 122 | D, M = self.metrics_delta(M) 123 | self.assertEqual(D, NEW_FLOW_COUNTERS | {'rx_flow_new_bad_cookie': 1}) 124 | -------------------------------------------------------------------------------- /ebpf/ebpf_sha256.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0-only 2 | // Copyright (c) 2025 Cloudflare, Inc. 
3 | // Licensed under the GNU General Public License Version 2 found in the ebpf/LICENSE file or at: 4 | // https://opensource.org/license/gpl-2-0 5 | 6 | struct sha256_buff { 7 | uint32_t h[8]; 8 | }; 9 | 10 | static void sha256_init(struct sha256_buff *buff) 11 | { 12 | buff->h[0] = 0x6a09e667; 13 | buff->h[1] = 0xbb67ae85; 14 | buff->h[2] = 0x3c6ef372; 15 | buff->h[3] = 0xa54ff53a; 16 | buff->h[4] = 0x510e527f; 17 | buff->h[5] = 0x9b05688c; 18 | buff->h[6] = 0x1f83d9ab; 19 | buff->h[7] = 0x5be0cd19; 20 | } 21 | 22 | static const uint32_t k[64] = { 23 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 24 | 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 25 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 26 | 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 27 | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 28 | 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 29 | 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 30 | 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 31 | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 32 | 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 33 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2}; 34 | 35 | #define ROTATE_R(val, bits) ((val) >> (bits) | (val) << (32 - (bits))) 36 | 37 | /* ctx is changed, b is not changed */ 38 | /* must be noinline */ 39 | __attribute__((noinline)) int sha256_calc_chunk(struct scratch *restrict scratch, 40 | struct sha256_buff *restrict ctx, 41 | struct sha256_block *restrict b) 42 | { 43 | if (ctx == NULL || b == NULL || scratch == NULL) 44 | return -1; 45 | uint32_t *chunk = (uint32_t *)b->u8; 46 | uint32_t *w = scratch->w; 47 | uint32_t *tv = scratch->tv; 48 | 49 | uint32_t i; 50 | 51 | #pragma unroll 52 | for (i = 0; i < 16; ++i) { 53 | w[i] = bswap32(chunk[i]); 54 
| } 55 | 56 | #pragma unroll 57 | for (i = 16; i < 64; ++i) { 58 | uint32_t s0 = ROTATE_R(w[i - 15], 7) ^ ROTATE_R(w[i - 15], 18) ^ 59 | (w[i - 15] >> 3); 60 | uint32_t s1 = ROTATE_R(w[i - 2], 17) ^ ROTATE_R(w[i - 2], 19) ^ 61 | (w[i - 2] >> 10); 62 | w[i] = w[i - 16] + s0 + w[i - 7] + s1; 63 | } 64 | 65 | #pragma unroll 66 | for (i = 0; i < 8; ++i) 67 | tv[i] = ctx->h[i]; 68 | 69 | // bumps stack with unroll 70 | #pragma nounroll 71 | for (i = 0; i < 64; ++i) { 72 | uint32_t S1 = 73 | ROTATE_R(tv[4], 6) ^ ROTATE_R(tv[4], 11) ^ ROTATE_R(tv[4], 25); 74 | uint32_t ch = (tv[4] & tv[5]) ^ (~tv[4] & tv[6]); 75 | uint32_t temp1 = tv[7] + S1 + ch + k[i] + w[i]; 76 | uint32_t S0 = 77 | ROTATE_R(tv[0], 2) ^ ROTATE_R(tv[0], 13) ^ ROTATE_R(tv[0], 22); 78 | uint32_t maj = (tv[0] & tv[1]) ^ (tv[0] & tv[2]) ^ (tv[1] & tv[2]); 79 | uint32_t temp2 = S0 + maj; 80 | 81 | tv[7] = tv[6]; 82 | tv[6] = tv[5]; 83 | tv[5] = tv[4]; 84 | tv[4] = tv[3] + temp1; 85 | tv[3] = tv[2]; 86 | tv[2] = tv[1]; 87 | tv[1] = tv[0]; 88 | tv[0] = temp1 + temp2; 89 | } 90 | 91 | #pragma unroll 92 | for (i = 0; i < 8; ++i) 93 | ctx->h[i] += tv[i]; 94 | return 0; 95 | } 96 | 97 | /* overwrites data */ 98 | static int sha256_final(struct scratch *scratch, struct sha256_buff *ctx, 99 | struct sha256_block *data, size_t data_len, int total_len) 100 | { 101 | if (data == NULL) 102 | return -1; 103 | if (data_len < 0 || data_len > 55) { 104 | // NOT IMPL 105 | return -1; 106 | } 107 | data->u8[data_len] = 0x80; 108 | data_len += 1; 109 | 110 | uint64_t size = total_len * 8; 111 | *(uint64_t *)&data->u8[56] = bswap64(size); 112 | 113 | sha256_calc_chunk(scratch, ctx, data); 114 | return 0; 115 | } 116 | 117 | static void sha256_read(const struct sha256_buff *buff, uint8_t *hash) 118 | { 119 | uint32_t i; 120 | uint32_t *h = (uint32_t *)hash; 121 | for (i = 0; i < 8; i++) { 122 | h[i] = bswap32(buff->h[i]); 123 | } 124 | } 125 | 126 | /* overwrites key and data */ 127 | /* Must be noinline. 
*/ 128 | __attribute__((noinline)) int sha256_hmac(struct scratch *scratch, 129 | struct sha256_block *key, 130 | struct sha256_block *data, size_t data_len) 131 | { 132 | if (key == NULL || data == NULL) 133 | return -1; 134 | 135 | size_t i; 136 | 137 | struct sha256_buff ctx; 138 | sha256_init(&ctx); 139 | 140 | // k_ipad 141 | for (i = 0; i < 8; i++) 142 | key->u64[i] ^= 0x3636363636363636ULL; 143 | sha256_calc_chunk(scratch, &ctx, key); 144 | 145 | // clear k_ipad, set k_opad 146 | for (i = 0; i < 8; i++) 147 | key->u64[i] ^= 0x3636363636363636ULL ^ 0x5c5c5c5c5c5c5c5cULL; 148 | 149 | sha256_final(scratch, &ctx, data, data_len, data_len + 64); 150 | memset(data->u8, 0, 64); 151 | sha256_read(&ctx, data->u8); 152 | 153 | sha256_init(&ctx); 154 | 155 | // already k_opad 156 | sha256_calc_chunk(scratch, &ctx, key); 157 | // clear k_opad 158 | for (i = 0; i < 8; i++) 159 | key->u64[i] ^= 0x5c5c5c5c5c5c5c5cULL; 160 | sha256_final(scratch, &ctx, data, 32, 32 + 64); 161 | memset(data->u8, 0, 64); 162 | sha256_read(&ctx, data->u8); 163 | return 0; 164 | } 165 | -------------------------------------------------------------------------------- /src/net.c: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include "common.h" 15 | 16 | #ifndef UNIX_PATH_MAX 17 | #define UNIX_PATH_MAX 108 18 | #endif 19 | 20 | socklen_t net_ss_size(struct sockaddr_storage *ss) 21 | { 22 | switch (ss->ss_family) { 23 | case AF_INET: 24 | return sizeof(struct sockaddr_in); 25 | case AF_INET6: 26 | return sizeof(struct sockaddr_in6); 27 | case AF_UNIX: { 28 | struct sockaddr_un *sun = (struct sockaddr_un *)(ss); 29 | socklen_t l = __builtin_offsetof(struct sockaddr_un, sun_path); 30 | if (sun->sun_path[0] != '\x00') { 31 | l += strnlen(sun->sun_path, UNIX_PATH_MAX); 32 | } else { 33 | l += 1 + strnlen(&sun->sun_path[1], UNIX_PATH_MAX - 1); 34 | } 35 | return l; 36 | } 37 | } 38 | return sizeof(struct sockaddr_storage); 39 | } 40 | 41 | int net_get_port(struct sockaddr_storage *ss) 42 | { 43 | switch (ss->ss_family) { 44 | case AF_INET: { 45 | struct sockaddr_in *sin = (struct sockaddr_in *)ss; 46 | return ntohs(sin->sin_port); 47 | } 48 | case AF_INET6: { 49 | struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss; 50 | return ntohs(sin6->sin6_port); 51 | } 52 | } 53 | return -1; 54 | } 55 | 56 | const char *net_ss_ntop(struct sockaddr_storage *ss, int show_port) 57 | { 58 | char s[sizeof(struct sockaddr_storage) + 1]; 59 | static char a[sizeof(struct sockaddr_storage) + 32]; 60 | const char *r; 61 | switch (ss->ss_family) { 62 | case AF_INET: { 63 | struct sockaddr_in *sin = (struct sockaddr_in *)ss; 64 | r = inet_ntop(sin->sin_family, &sin->sin_addr, s, sizeof(s)); 65 | if (r == NULL) { 66 | error(-1, errno, "inet_ntop()"); 67 | } 68 | if (show_port == 0) { 69 | snprintf(a, sizeof(a), "%s", s); 70 | } else { 71 | int port = htons(sin->sin_port); 72 | snprintf(a, sizeof(a), "%s:%d", s, port); 73 | } 74 | break; 75 | } 76 | case AF_INET6: { 77 | struct 
sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss; 78 | r = inet_ntop(sin6->sin6_family, &sin6->sin6_addr, s, sizeof(s)); 79 | if (r == NULL) { 80 | error(-1, errno, "inet_ntop()"); 81 | } 82 | if (show_port == 0) { 83 | snprintf(a, sizeof(a), "%s", s); 84 | } else { 85 | int port = htons(sin6->sin6_port); 86 | snprintf(a, sizeof(a), "[%s]:%d", s, port); 87 | } 88 | break; 89 | } 90 | case AF_UNIX: { 91 | struct sockaddr_un *sun = (struct sockaddr_un *)ss; 92 | memcpy(s, sun->sun_path, sizeof(sun->sun_path)); 93 | s[sizeof(sun->sun_path)] = '\x00'; 94 | if (s[0] == '\x00') { 95 | s[0] = '@'; 96 | } 97 | snprintf(a, sizeof(a), "%s", s); 98 | break; 99 | } 100 | default: 101 | error(-1, 0, "Unknown ss family %d", ss->ss_family); 102 | } 103 | return a; 104 | } 105 | 106 | int net_parse_sockaddr(struct sockaddr_storage *ss, const char *addr, int default_port) 107 | { 108 | int force_family = 0; 109 | char host[256]; 110 | strncpy(host, addr, sizeof(host)); 111 | host[sizeof(host) - 1] = '\x00'; 112 | 113 | // Try v6: 114 | int r = net_gethostbyname(ss, host, default_port, AF_INET6); 115 | if (r >= 0) { 116 | return ss->ss_family; 117 | } 118 | 119 | long port = 0; 120 | char *colon = strrchr(addr, ':'); 121 | if (colon == NULL || colon[1] == '\0') { 122 | port = default_port; 123 | colon = NULL; 124 | } else { 125 | char *endptr; 126 | port = strtol(&colon[1], &endptr, 10); 127 | if (port < 0 || port > 65535 || *endptr != '\0') { 128 | port = default_port; 129 | colon = NULL; 130 | } 131 | } 132 | 133 | // Cut at colon 134 | if (colon) { 135 | int addr_len = colon - addr >= (int)(sizeof host) ? 
(int)sizeof(host) - 1 136 | : colon - addr; 137 | host[addr_len] = '\0'; 138 | } 139 | if (host[0] == '[' && host[strlen(host) - 1] == ']') { 140 | force_family = AF_INET6; 141 | host[strlen(host) - 1] = '\x00'; 142 | memmove(host, &host[1], strlen(&host[1]) + 1); 143 | } 144 | 145 | return net_gethostbyname(ss, host, port, force_family); 146 | } 147 | 148 | int net_gethostbyname(struct sockaddr_storage *ss, const char *host, int port, 149 | int force_family) 150 | { 151 | memset(ss, 0, sizeof(struct sockaddr_storage)); 152 | 153 | struct in_addr in_addr; 154 | struct in6_addr in6_addr; 155 | 156 | /* Try ipv4 address first */ 157 | if ((force_family == 0 || force_family == AF_INET) && 158 | inet_pton(AF_INET, host, &in_addr) == 1) { 159 | struct sockaddr_in *sin4 = (struct sockaddr_in *)ss; 160 | *sin4 = (struct sockaddr_in){.sin_family = AF_INET, 161 | .sin_port = htons(port), 162 | .sin_addr = in_addr}; 163 | return AF_INET; 164 | } 165 | 166 | /* Then ipv6 */ 167 | if ((force_family == 0 || force_family == AF_INET6) && 168 | inet_pton(AF_INET6, host, &in6_addr) == 1) { 169 | struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss; 170 | *sin6 = (struct sockaddr_in6){.sin6_family = AF_INET6, 171 | .sin6_port = htons(port), 172 | .sin6_addr = in6_addr}; 173 | return AF_INET6; 174 | } 175 | 176 | #if 0 177 | /* Then assume unix socket path */ 178 | if ((force_family == 0 || force_family == AF_UNIX)) { 179 | struct sockaddr_un *sun = (struct sockaddr_un *)ss; 180 | sun->sun_family = AF_UNIX; 181 | strncpy(sun->sun_path, host, UNIX_PATH_MAX); 182 | // Linux abstract sockets often use @ for zero 183 | if (sun->sun_path[0] == '@') { 184 | sun->sun_path[0] = '\x00'; 185 | } 186 | return AF_UNIX; 187 | } 188 | #endif 189 | 190 | // error(-1, errno, "inet_pton(\"%s\")", host); 191 | return -1; 192 | } 193 | -------------------------------------------------------------------------------- /src/uspace.c: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "common.h" 15 | 16 | #include "../ebpf.skel.h" 17 | 18 | extern struct ebpf *skel; 19 | 20 | void cookies_find_empty(struct reuseport_storage *state, int gen, int sockhash_fd, 21 | uint64_t cookie, int *prev_pos, int *free_pos, int *gen_len) 22 | { 23 | *free_pos = -1; 24 | *prev_pos = -1; 25 | *gen_len = 0; 26 | int i, r; 27 | for (i = 0; i < MAX_SOCKETS_IN_GEN; i++) { 28 | uint64_t uc = state->cookies[gen % MAX_GENS][i]; 29 | 30 | uint64_t v = 0; 31 | /* Is the cookie pointing to live socket? */ 32 | r = bpf_map_lookup_elem(sockhash_fd, &uc, &v); 33 | if (!(r != 0 || v == 0)) { 34 | /* r != 0 means entry unset (not stale, not 35 | * good). v == 0 means stale. Socket exists. 36 | */ 37 | if (v == cookie && *prev_pos == -1) { 38 | *prev_pos = i; 39 | } 40 | *gen_len = i + 1; 41 | continue; 42 | } 43 | 44 | /* Guaranteed empty slot */ 45 | if (*free_pos == -1) { 46 | *free_pos = i; 47 | *gen_len = i + 1; 48 | } 49 | } 50 | } 51 | 52 | void run_cb_update_map(struct msg_value *msg) 53 | { 54 | int fd = bpf_program__fd(skel->progs.udpgrm_cb_update_map); 55 | LIBBPF_OPTS(bpf_test_run_opts, topts); 56 | 57 | topts.data_in = msg; 58 | topts.data_size_in = sizeof(*msg); 59 | int err = bpf_prog_test_run_opts(fd, &topts); 60 | if (err != 0 || topts.retval != 0) { 61 | error(-1, errno, "Failed to call ebpf, err=%d retval=%d\n", err, 62 | topts.retval); 63 | } 64 | } 65 | 66 | /* Errors aren't critical. 
*/ 67 | void metric_incr_critical(const struct reuseport_storage_key *skey, int counter, 68 | int gauge) 69 | { 70 | { 71 | struct msg_value msg; 72 | memset(&msg, 0, sizeof(msg)); 73 | msg = (struct msg_value){ 74 | .skey = *skey, 75 | .type = GSM_SET_SOCKET_CRITICAL_GAUGE, 76 | .value = gauge, 77 | }; 78 | run_cb_update_map(&msg); 79 | } 80 | 81 | if (counter) { 82 | struct msg_value msg; 83 | memset(&msg, 0, sizeof(msg)); 84 | msg = (struct msg_value){ 85 | .skey = *skey, 86 | .type = GSM_INCR_SOCKET_CRITICAL, 87 | .value = counter, 88 | }; 89 | run_cb_update_map(&msg); 90 | } 91 | } 92 | 93 | int map_from_prog(int prog_fd, char *map_name, struct bpf_map_info *user_map_info) 94 | { 95 | struct bpf_prog_info prog_info = {}; 96 | uint32_t *map_ids = calloc(128, sizeof(uint32_t)); 97 | prog_info.nr_map_ids = 128; 98 | prog_info.map_ids = (uint64_t)(uintptr_t)map_ids; 99 | 100 | uint32_t prog_info_len = sizeof(prog_info); 101 | int r = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len); 102 | if (r != 0) { 103 | free(map_ids); 104 | // error(-1, errno, "bpf_prog_get_info_by_fd"); 105 | return -1; 106 | } 107 | 108 | int i; 109 | for (i = 0; i < (int)prog_info.nr_map_ids; i += 1) { 110 | int map_fd = bpf_map_get_fd_by_id(map_ids[i]); 111 | if (map_fd < 0) 112 | error(-1, errno, "bpf_map_get_fd_by_id"); 113 | 114 | struct bpf_map_info map_info = {}; 115 | uint32_t map_info_len = sizeof(map_info); 116 | int r = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len); 117 | if (r < 0) { 118 | free(map_ids); 119 | // error(-1, errno, "bpf_map_get_info_by_fd"); 120 | return -1; 121 | } 122 | if (strcmp(map_info.name, map_name) == 0) { 123 | free(map_ids); 124 | if (user_map_info) 125 | *user_map_info = map_info; 126 | return map_fd; 127 | } 128 | close(map_fd); 129 | } 130 | 131 | free(map_ids); 132 | return -1; 133 | } 134 | 135 | void skey_from_ss(struct reuseport_storage_key *skey, struct sockaddr_storage *ss) 136 | { 137 | *skey = (struct 
reuseport_storage_key){ 138 | .family = ss->ss_family, 139 | .src_port = net_get_port(ss), 140 | }; 141 | switch (ss->ss_family) { 142 | case AF_INET: { 143 | struct sockaddr_in *sin = (struct sockaddr_in *)ss; 144 | memcpy(&skey->src_ip4, &sin->sin_addr, 4); 145 | break; 146 | } 147 | case AF_INET6: { 148 | struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss; 149 | memcpy(&skey->src_ip6, &sin6->sin6_addr, 16); 150 | break; 151 | } 152 | } 153 | } 154 | 155 | int *map_by_name(char *map_name, uint32_t skip_id) 156 | { 157 | if (strlen(map_name) > BPF_OBJ_NAME_LEN) 158 | error(-1, -1, ""); 159 | 160 | static int map_fd_list[128]; 161 | uint32_t map_fd_cnt = 0; 162 | 163 | uint32_t next_id = 0; 164 | while (!bpf_map_get_next_id(next_id, &next_id)) { 165 | if (skip_id == next_id) { 166 | continue; 167 | } 168 | 169 | int fd = bpf_map_get_fd_by_id(next_id); 170 | if (fd < 0 && errno == ENOENT) { 171 | continue; 172 | } 173 | if (fd < 0) { 174 | error(-1, errno, "bpf_map_get_fd_by_id(name=%s)", map_name); 175 | } 176 | 177 | struct bpf_map_info map_info = {}; 178 | uint32_t info_len = sizeof(map_info); 179 | int r = bpf_obj_get_info_by_fd(fd, &map_info, &info_len); 180 | if (r != 0) { 181 | error(-1, errno, "bpf_obj_get_info_by_fd(name=%s)", map_name); 182 | } 183 | 184 | if (strncmp(map_name, map_info.name, BPF_OBJ_NAME_LEN) != 0) { 185 | close(fd); 186 | continue; 187 | } 188 | 189 | map_fd_list[map_fd_cnt++] = fd; 190 | if (map_fd_cnt >= ARRAY_SIZE(map_fd_list) - 1) { 191 | break; 192 | } 193 | } 194 | map_fd_list[map_fd_cnt] = -1; 195 | return map_fd_list; 196 | } 197 | 198 | uint32_t map_fd_to_id(int map_fd) 199 | { 200 | struct bpf_map_info map_info = {}; 201 | uint32_t info_len = sizeof(map_info); 202 | int r = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len); 203 | if (r != 0) { 204 | error(-1, errno, "bpf_obj_get_info_by_fd()"); 205 | } 206 | 207 | return map_info.id; 208 | } 209 | 
-------------------------------------------------------------------------------- /ebpf/ebpf_internal.h: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0-only 2 | // Copyright (c) 2025 Cloudflare, Inc. 3 | // Licensed under the GNU General Public License Version 2 found in the ebpf/LICENSE file or at: 4 | // https://opensource.org/license/gpl-2-0 5 | 6 | #define SEC_TO_NSEC(v) ((v)*1000000000ULL) 7 | #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) 8 | 9 | #define MAX_REUSEPORT_GROUPS 512 10 | #define MAX_TOTAL_FLOWS 8192 11 | 12 | #define MIN(a, b) (((a) < (b)) ? (a) : (b)) 13 | #define MAX(a, b) (((a) > (b)) ? (a) : (b)) 14 | /* This is so hard. So. We want to avoid putting mess on stack, since 15 | * struct msg_value is large, reserve mem for it - on ringbuf. Only 16 | * then do snprintf, but we don't want to submit too large block to 17 | * ringbuf to avoid wasting space _there_. Therefore do 18 | * bpf_ringbuf_output, to copy the msg there and discard the original 19 | * allocation. Basically, we're using ringbuf as a malloc. Hurray. */ 20 | #define log_printf(fmt, args...) \ 21 | ({ \ 22 | static const char *___fmt = fmt; \ 23 | unsigned long long ___param[___bpf_narg(args)]; \ 24 | \ 25 | _Pragma("GCC diagnostic push") \ 26 | _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ 27 | ___bpf_fill(___param, args); \ 28 | _Pragma("GCC diagnostic pop") \ 29 | \ 30 | struct msg_value *e = bpf_ringbuf_reserve( \ 31 | &msg_rb, sizeof(struct msg_value), 0); \ 32 | if (e != NULL) { \ 33 | long l = bpf_snprintf(&e->log[0], sizeof(e->log), ___fmt, \ 34 | ___param, sizeof(___param)); \ 35 | unsigned ll = offsetof(struct msg_value, log) + l; \ 36 | if (ll > sizeof(struct msg_value)) \ 37 | ll = sizeof(struct msg_value); \ 38 | bpf_ringbuf_output(&msg_rb, e, ll, 0); \ 39 | bpf_ringbuf_discard(e, 0); \ 40 | } \ 41 | }) 42 | 43 | #define log_printfs(_skey, fmt, args...) 
\ 44 | ({ \ 45 | static const char *___fmt = fmt; \ 46 | unsigned long long ___param[___bpf_narg(args)]; \ 47 | \ 48 | _Pragma("GCC diagnostic push") \ 49 | _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ 50 | ___bpf_fill(___param, args); \ 51 | _Pragma("GCC diagnostic pop") \ 52 | \ 53 | struct msg_value *_e = bpf_ringbuf_reserve( \ 54 | &msg_rb, sizeof(struct msg_value), 0); \ 55 | if (_e != NULL) { \ 56 | _e->skey = *(_skey); \ 57 | long l = bpf_snprintf(&_e->log[0], sizeof(_e->log), ___fmt, \ 58 | ___param, sizeof(___param)); \ 59 | unsigned ll = offsetof(struct msg_value, log) + l; \ 60 | if (ll > sizeof(struct msg_value)) \ 61 | ll = sizeof(struct msg_value); \ 62 | bpf_ringbuf_output(&msg_rb, _e, ll, 0); \ 63 | bpf_ringbuf_discard(_e, 0); \ 64 | } \ 65 | }) 66 | 67 | /* Global metrics 68 | */ 69 | struct { 70 | __uint(type, BPF_MAP_TYPE_ARRAY); 71 | __uint(max_entries, 1); 72 | __type(key, unsigned int); 73 | __type(value, metrics_t); 74 | } metrics_map SEC(".maps"); 75 | 76 | struct { 77 | __uint(type, BPF_MAP_TYPE_RINGBUF); 78 | __uint(max_entries, 512 * 1024); 79 | } msg_rb SEC(".maps"); 80 | 81 | struct { 82 | __uint(type, BPF_MAP_TYPE_SK_STORAGE); 83 | __uint(map_flags, BPF_F_NO_PREALLOC); 84 | __type(key, int); 85 | __type(value, struct socket_storage); 86 | } sk_storage_map SEC(".maps"); 87 | 88 | #define PERCPU_ARRAY_SIZE 0x400 89 | struct { 90 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 91 | __type(key, int); 92 | __uint(value_size, PERCPU_ARRAY_SIZE); 93 | __uint(max_entries, 1); 94 | } percpu_array_map SEC(".maps"); 95 | 96 | struct { 97 | __uint(type, BPF_MAP_TYPE_HASH); 98 | __uint(max_entries, MAX_REUSEPORT_GROUPS); 99 | __type(key, struct reuseport_storage_key); 100 | __type(value, struct reuseport_storage); 101 | } reuseport_storage_map SEC(".maps"); 102 | 103 | struct { 104 | __uint(type, BPF_MAP_TYPE_SOCKHASH); 105 | __uint(max_entries, MAX_SOCKETS_IN_GEN *MAX_GENS); 106 | __uint(key_size, sizeof(uint64_t)); 107 | __uint(value_size, 
sizeof(uint64_t)); 108 | } sockhash SEC(".maps"); 109 | 110 | struct { 111 | __uint(type, BPF_MAP_TYPE_LRU_HASH); 112 | __uint(map_flags, BPF_F_NO_COMMON_LRU); // LRU should be per-CPU, for undetermined 113 | // speed gains on flow table contention. 114 | __uint(max_entries, MAX_TOTAL_FLOWS); 115 | __uint(key_size, sizeof(struct lru_key)); 116 | __uint(value_size, sizeof(struct lru_value)); 117 | } lru_map SEC(".maps"); 118 | 119 | #define METRIC_INC(token) __sync_fetch_and_add(&state->token, 1ULL) 120 | 121 | struct ip_flow_hash { 122 | uint32_t remote_ip[4]; 123 | uint32_t reuseport_group_id; 124 | uint16_t remote_port; 125 | } __attribute__((packed)); 126 | -------------------------------------------------------------------------------- /tests/test_tubular.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Cloudflare, Inc. 2 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | # https://opensource.org/licenses/Apache-2.0 4 | 5 | from . 
def tubular_recv(uds):
    """Accept one registration on *uds* and return ``(msg, sock)``.

    Plays the tubular side of the exchange: accepts a connection on the
    listening unix socket *uds*, reads one message carrying a single file
    descriptor as ``SCM_RIGHTS`` ancillary data, answers ``b'OK'`` and
    closes the connection.

    Returns:
        A tuple ``(msg, sock)`` where *msg* is the raw payload and *sock*
        is a ``base._socket`` built around the received descriptor, using
        the domain/type/protocol probed from that descriptor.

    Raises:
        RuntimeError: if no ancillary data arrived, or if the first
            control message is not ``SOL_SOCKET``/``SCM_RIGHTS``.
    """
    conn, _ = uds.accept()
    try:
        msg, ancdata, _, _ = conn.recvmsg(
            4096, socket.CMSG_LEN(struct.calcsize("i")))
        conn.send(b'OK')
    finally:
        # Close the accepted connection even when recvmsg()/send() raises.
        conn.close()

    if not ancdata:
        raise RuntimeError("No file descriptor received")

    cmsg_level, cmsg_type, cmsg_data = ancdata[0]

    if cmsg_level != socket.SOL_SOCKET or cmsg_type != socket.SCM_RIGHTS:
        raise RuntimeError("Unexpected control message")

    received_fd = struct.unpack("i", cmsg_data)[0]

    # socket.fromfd() dup()s the descriptor; the duplicate is only needed
    # to query the socket parameters and must be closed afterwards.
    probe = socket.fromfd(received_fd, 0, 0, 0)
    try:
        domain = probe.getsockopt(socket.SOL_SOCKET, socket.SO_DOMAIN)
        sock_type = probe.getsockopt(socket.SOL_SOCKET, socket.SO_TYPE)
        protocol = probe.getsockopt(socket.SOL_SOCKET, socket.SO_PROTOCOL)
    finally:
        probe.close()

    return msg, base._socket(domain, sock_type, protocol, fileno=received_fd)
def test_sockets_leak(self):
    """Preserved socket fds are released when tubular registration fails.

    The daemon is pointed at a tubular path that does not exist. After
    the socket joins generation 0 the daemon holds one extra fd; once
    the working generation is set, registration fails and the fd count
    must drop back to the baseline.
    """
    bad_uds_path = '/bad_path'
    daemon = self.udpgrm_run(
        "--daemon --install --tubular=%s" % (bad_uds_path,))
    for marker in ("Tubular path", "Tailing"):
        self.assertTrue(daemon.collect_stderr(marker)[0])
    baseline_fds = daemon.fd_count()

    sock, _ = self.bind()

    label = b'udp_server'
    dissector_opt = struct.pack(
        "IIII100s",
        DISSECTOR_FLOW, 126, 0, 0,
        label.ljust(100, b'\x00'))
    sock.setsockopt(IPPROTO_UDP, UDP_GRM_DISSECTOR, dissector_opt)
    sock.setsockopt(IPPROTO_UDP, UDP_GRM_SOCKET_GEN, 0)
    self.sync_socket_gen(sock)
    # The daemon now holds a reference to the registered socket.
    self.assertEqual(baseline_fds + 1, daemon.fd_count())

    sock.setsockopt(IPPROTO_UDP, UDP_GRM_WORKING_GEN, 0)
    self.assertTrue(daemon.collect_stdout("Tubular register failed")[0])

    # A failed registration must not leak the preserved fd.
    self.assertEqual(baseline_fds, daemon.fd_count())
port_b) 126 | label_b = b'udp_server_b' 127 | sb.setsockopt(IPPROTO_UDP, UDP_GRM_DISSECTOR, 128 | struct.pack("IIII100s", 129 | DISSECTOR_FLOW, 126, 0, 0, 130 | label_b + b'\x00' * (100-len(label_b)))) 131 | sb.setsockopt(IPPROTO_UDP, UDP_GRM_SOCKET_GEN, 0) 132 | self.sync_socket_gen(sb) 133 | 134 | self.assertEqual(fd_a+2, p.fd_count()) 135 | 136 | # one 137 | sa.setsockopt(IPPROTO_UDP, UDP_GRM_WORKING_GEN, 0) 138 | self.assertTrue(p.collect_stdout("Working gen app=0 0")[0]) 139 | 140 | msg, recv_sd = tubular_recv(uds) 141 | self.assertEqual(msg, b"udp_server_a#") 142 | self.assertEqual(sa.cookie(), recv_sd.cookie()) 143 | recv_sd.close() 144 | 145 | # two 146 | sb.setsockopt(IPPROTO_UDP, UDP_GRM_WORKING_GEN, 0) 147 | self.assertTrue(p.collect_stdout("Working gen app=0 0")[0]) 148 | 149 | msg, recv_sd = tubular_recv(uds) 150 | self.assertEqual(msg, b"udp_server_b#") 151 | self.assertEqual(sb.cookie(), recv_sd.cookie()) 152 | recv_sd.close() 153 | 154 | # test leaked fds 155 | self.assertEqual(fd_a, p.fd_count()) 156 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: udpgrm examples/venv/.ok mmdecoy 2 | 3 | BPFTOOL?=bpftool 4 | BPFTOOL:=$(shell PATH=$$PATH:/usr/local/sbin:/usr/sbin which $(BPFTOOL)) 5 | 6 | ifeq ($(BPFTOOL),) 7 | $(error bpftool not found) 8 | endif 9 | 10 | 11 | CLANG_DIR?= 12 | CLANG_BIN?=$(CLANG_DIR)clang-18 13 | 14 | HOST_ARCH := $(shell uname -m) 15 | TARGET_ARCH ?= $(HOST_ARCH) 16 | 17 | ifeq ($(HOST_ARCH),x86_64) 18 | INCLUDES_X86_64 := /usr/include/x86_64-linux-gnu 19 | INCLUDES_AARCH64 := /usr/aarch64-linux-gnu/include 20 | else ifeq ($(HOST_ARCH),aarch64) 21 | INCLUDES_X86_64 := /usr/x86_64-linux-gnu/include 22 | INCLUDES_AARCH64 := /usr/include/aarch64-linux-gnu 23 | else 24 | $(error unsupported host architecture $(HOST_ARCH)) 25 | endif 26 | 27 | INCLUDES := 28 | CFLAGS := 29 | ifeq 
# Build the eBPF object. Only ebpf/ebpf.c is handed to clang; the other
# files in $(EBPF_DEPS) are prerequisites so that touching any of them
# triggers a rebuild. $(EBPF_DEPS) already contains Makefile and
# $(KERNEL_HEADERS), so they are not repeated here.
ebpf.o: $(EBPF_DEPS)
	$(CLANG_BIN) \
		$(CFLAGS) $(EXTRA_CFLAGS) \
		-g -O2 -Wall -Wextra -target bpf -mcpu=v3 \
		-fwrapv -Wno-address-of-packed-member \
		ebpf/ebpf.c \
		$(INCLUDES) \
		-c -o $@
count expressed as code coverage ****" 79 | @cat bpftool-log.tmp | bash tools/verifier_log_to_cov_2.sh > cov.info 80 | @genhtml -q cov.info -o cov_verifier_html --config-file .lcovrc --ignore-errors unmapped,unmapped --synthesize-missing 81 | @echo "xdg-open file://$(CURDIR)/cov_verifier_html/ebpf/ebpf/ebpf.c.gcov.html" 82 | 83 | 84 | ebpf.skel.h: ebpf.o 85 | $(BPFTOOL) gen skeleton ebpf.o name ebpf > $@ 86 | 87 | UDPGRM_SOURCE=src/*.c 88 | UDPGRM_HEADERS=src/*.h include/udpgrm*.h 89 | UDPGRM_DEPS=$(UDPGRM_SOURCE) $(UDPGRM_HEADERS) ebpf.skel.h 90 | 91 | udpgrm: $(UDPGRM_DEPS) 92 | $(CLANG_BIN) \ 93 | $(UDPGRM_SOURCE) \ 94 | $(CFLAGS) \ 95 | $(LDFLAGS) \ 96 | $(EXTRA_CFLAGS) \ 97 | -g -O2 -Wall -Wextra -fwrapv -fno-omit-frame-pointer \ 98 | -lbpf -lelf -lz -lsystemd \ 99 | -DPACKAGE_VERSION=\"$(VERSION)\" \ 100 | $(LIBS) \ 101 | -o $@ 102 | 103 | 104 | MMDECOY_DEPS=tools/mmdecoy.c 105 | mmdecoy: $(MMDECOY_DEPS) 106 | $(CLANG_BIN) \ 107 | $(MMDECOY_DEPS) \ 108 | $(CFLAGS) \ 109 | $(LDFLAGS) \ 110 | $(EXTRA_CFLAGS) \ 111 | -g -O2 -Wall -Wextra -fno-omit-frame-pointer \ 112 | -lsystemd \ 113 | $(LIBS) \ 114 | -o mmdecoy 115 | 116 | udpgrm-test: $(UDPGRM_DEPS) 117 | rm -f udpgrm 118 | $(MAKE) udpgrm EXTRA_CFLAGS="-fprofile-instr-generate -fcoverage-mapping" 119 | mv -f udpgrm $@ 120 | 121 | 122 | tqserver: crates/udpgrm/examples/*.rs 123 | (cd crates/udpgrm; cargo build --release --example $@) 124 | cp crates/udpgrm/target/release/examples/$@ $@ 125 | 126 | client: crates/udpgrm/examples/*.rs 127 | (cd crates/udpgrm; cargo build --release --example $@) 128 | cp crates/udpgrm/target/release/examples/$@ $@ 129 | 130 | .PHONY: format 131 | format: 132 | @which markdownfmt || (echo '[*] Install markdownfmt:\n\tgo install github.com/shurcooL/markdownfmt@latest'; exit 1) 133 | $(CLANG_DIR)clang-format -i \ 134 | $(UDPGRM_SOURCE) \ 135 | $(UDPGRM_HEADERS) \ 136 | $(EBPF_SOURCE) \ 137 | $(EBPF_HEADERS) \ 138 | $(MMDECOY_DEPS) 139 | autopep8 -i *.py tests/*py examples/*py 140 | 
markdownfmt -w README.md 141 | @grep -n "TODO" *.[ch] *.md || true 142 | 143 | examples/venv/.ok: 144 | (cd examples; virtualenv venv) 145 | (cd examples; ./venv/bin/pip install -r requirements.txt) 146 | touch $@ 147 | 148 | DATE := $(shell date -u '+%Y.%-m.%-d') 149 | BUILD_NUMBER ?= 0 150 | REVISION := $(shell git rev-parse --short HEAD) 151 | TIMESTAMP := $(shell date -u '+%Y-%m-%d-%H:%MUTC') 152 | 153 | VERSION := $(shell git describe --tags --always --exclude 'crate/*') 154 | 155 | ifndef CI 156 | VERSION := $(VERSION)-dev 157 | endif 158 | 159 | 160 | .PHONY: clean 161 | clean: 162 | rm -f ebpf.skel.h ebpf.o udpgrm udpgrm_*.deb mmdecoy bpftool-log.tmp client tqserver udpgrm-test ebpf.su cov.info 163 | rm -rf arch cov_html crates/udpgrm/target/release/examples crates/udpgrm/target/debug/examples cov_verifier_html 164 | 165 | TEST:=tests 166 | .PHONY: test 167 | test: udpgrm-test tqserver client 168 | @rm -rf *.profraw cov_html coverage.profdata 169 | sudo \ 170 | LLVM_PROFILE_FILE="udpgrm_%p.profraw" \ 171 | UDPGRMBIN="./udpgrm-test" \ 172 | PYTHONPATH=. PYTHONIOENCODING=utf-8 \ 173 | python3 -m tests.runner $(TEST) 174 | llvm-profdata merge -sparse udpgrm*.profraw -o coverage.profdata 175 | llvm-cov export \ 176 | --instr-profile=coverage.profdata \ 177 | ./udpgrm-test --format=lcov > cov.info 178 | genhtml -q cov.info -o cov_html -t udpgrm --config-file .lcovrc --ignore-errors unmapped 179 | llvm-cov report ./udpgrm-test -instr-profile=coverage.profdata 180 | @rm -rf *.profraw cov.info coverage.profdata 181 | @echo "[*] Run:\n xdg-open file://$(CURDIR)/cov_html/udpgrm/index.html" 182 | -------------------------------------------------------------------------------- /src/tubular.c: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 
/* Synchronously register `fds_num` sockets from `fds` with tubular.
 *
 * Connects to the SOCK_SEQPACKET unix socket at `tubular_path`, sends the
 * payload "<label>#" with the descriptors attached as one SCM_RIGHTS
 * control message, and waits for the two-byte reply "OK". Send and
 * receive are each bounded by a one second timeout.
 *
 * Returns 0 on success, otherwise an errno value:
 *   - ENAMETOOLONG if `tubular_path` does not fit in sun_path,
 *   - the errno of the failing socket()/connect()/sendmsg()/read(),
 *   - EIO if the payload was only partially sent,
 *   - EPROTO if the reply is anything other than "OK".
 */
static int tubular_register(char *tubular_path, char *label, int fds[], int fds_num)
{
	int s = socket(AF_UNIX, SOCK_SEQPACKET, 0);
	if (s < 0) {
		return errno;
	}

	struct timeval timeout = {.tv_sec = 1};

	/* Best effort: without the timeouts the calls below merely block longer. */
	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof(timeout));
	setsockopt(s, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(timeout));

	struct sockaddr_un sun = {};
	sun.sun_family = AF_UNIX;
	size_t path_len = strlen(tubular_path);
	if (path_len > sizeof(sun.sun_path)) {
		/* Truncating would silently connect to a different path. */
		close(s);
		return ENAMETOOLONG;
	}
	memcpy(sun.sun_path, tubular_path, path_len);
	int r = connect(s, (struct sockaddr *)&sun,
			net_ss_size((struct sockaddr_storage *)&sun));
	if (r != 0) {
		int err = errno; /* close() may overwrite errno */
		close(s);
		return err;
	}

	char payload[256];
	snprintf(payload, sizeof(payload), "%s#", label);

	struct iovec iovec = {
		.iov_base = payload,
		.iov_len = strlen(payload),
	};

	int ctrl_sz = CMSG_SPACE(sizeof(int) * fds_num) + 1024;
	char ctrl[ctrl_sz];

	struct msghdr msg = {
		.msg_iov = &iovec,
		.msg_iovlen = 1,
		.msg_control = ctrl,
		.msg_controllen = ctrl_sz,
	};

	set_scm_rights_cmsg(&msg, ctrl_sz, fds, fds_num);
	r = sendmsg(s, &msg, 0);
	if (r != (int)iovec.iov_len) {
		/* errno is only meaningful when sendmsg() itself failed; a
		 * short send must not be reported with a stale (maybe 0) errno. */
		int err = (r < 0) ? errno : EIO;
		close(s);
		return err;
	}

	char buf[1024] = {};
	int n = read(s, buf, sizeof(buf));
	if (n < 0) {
		int err = errno;
		close(s);
		return err;
	}

	close(s);
	/* buf is zero-initialised, so a two byte reply is NUL terminated. */
	if (n != 2 || strcmp(buf, "OK") != 0) {
		return EPROTO;
	}
	return 0;
}
(sk_group) 145 | return sk_group; 146 | 147 | sk_group = calloc(1, sizeof(struct reuseport_group)); 148 | sk_group->random_id = state->random_id; 149 | sk_group->last_modified = realtime_now(); 150 | list_add(&sk_group->in_list, &list_of_tubular_sockets); 151 | return sk_group; 152 | } 153 | 154 | int reuseport_group_maybe_delete(struct reuseport_group *sk_group) 155 | { 156 | int g; 157 | for (g = 0; g < MAX_GENS; g++) { 158 | if (sk_group->max_idx[g] != 0) { 159 | return 0; 160 | } 161 | } 162 | list_del(&sk_group->in_list); 163 | free(sk_group); 164 | return 1; 165 | } 166 | 167 | void tubular_close_wg(struct reuseport_group *sk_group, int wg); 168 | 169 | void reuseport_groups_maybe_cleanup_stale() 170 | { 171 | uint64_t now = realtime_now(); 172 | struct list_head *pos, *tmp; 173 | list_for_each_safe(pos, tmp, &list_of_tubular_sockets) 174 | { 175 | struct reuseport_group *sk_group = 176 | hlist_entry(pos, struct reuseport_group, in_list); 177 | // 10 seconds 178 | if (now - sk_group->last_modified > 10 * 1000000000ULL) { 179 | printf("[#] cleaning up stale tubular sockets\n"); 180 | int g; 181 | for (g = 0; g < MAX_GENS; g++) { 182 | tubular_close_wg(sk_group, g); 183 | } 184 | reuseport_group_maybe_delete(sk_group); 185 | } 186 | } 187 | } 188 | 189 | /* Should caller close the top fd or did we steal it? 
*/ 190 | int tubular_maybe_preserve_fd(struct reuseport_storage *state, int gen, int gen_len, 191 | int free_pos, int f) 192 | { 193 | if (state->dis.label[0] != '\x00') { 194 | struct reuseport_group *sk_group = reuseport_group_lookup_or_add(state); 195 | sk_group->max_idx[gen % MAX_GENS] = gen_len; 196 | int *fd_ptr = &sk_group->sockets[gen % MAX_GENS][free_pos]; 197 | if (*fd_ptr > 0) 198 | close(*fd_ptr); 199 | *fd_ptr = f; 200 | sk_group->last_modified = realtime_now(); 201 | return 0; 202 | } 203 | return 1; 204 | } 205 | 206 | void tubular_close_wg(struct reuseport_group *sk_group, int wg) 207 | { 208 | int i; 209 | int max_idx = sk_group->max_idx[wg % MAX_GENS]; 210 | for (i = 0; i < max_idx; i++) { 211 | int *fd_ptr = &sk_group->sockets[wg % MAX_GENS][i]; 212 | if (*fd_ptr > 0) 213 | close(*fd_ptr); 214 | *fd_ptr = -1; 215 | } 216 | sk_group->max_idx[wg % MAX_GENS] = 0; 217 | } 218 | 219 | /* returns errno */ 220 | int tubular_maybe_register(struct reuseport_storage *state, int wg, char *tubular_path) 221 | { 222 | int err = 0; 223 | if (state->dis.label[0] != '\x00') { 224 | struct reuseport_group *sk_group = reuseport_group_lookup(state); 225 | 226 | if (tubular_path == NULL) { 227 | err = ENOENT; 228 | } else if (sk_group == NULL) { 229 | // No fds, we could send empty message to tubular 230 | printf("No new sockets to register to tubular wg=%d\n", wg); 231 | err = EBADF; 232 | } else { 233 | int max_idx = sk_group->max_idx[wg]; 234 | char label[LABEL_SZ + 1]; 235 | memcpy(label, state->dis.label, LABEL_SZ); 236 | label[LABEL_SZ] = '\x00'; 237 | err = tubular_register(tubular_path, label, sk_group->sockets[wg], 238 | max_idx); 239 | } 240 | 241 | if (sk_group) { 242 | tubular_close_wg(sk_group, wg); 243 | reuseport_group_maybe_delete(sk_group); 244 | } 245 | } 246 | return err; 247 | } 248 | -------------------------------------------------------------------------------- /include/udpgrm_internal.h: 
/* Key identifying one reuseport group; used as the key type of
 * reuseport_storage_map. Packed: 1 + 1 + 2 + 16 = 20 bytes.
 *
 * This should also be keyed by netns cookie, device index
 * (BINDTODEVICE), and possibly REUSEPORT uid. */
struct reuseport_storage_key {
	/* AF_INET or AF_INET6; type is always SOCK_DGRAM, protocol is always IPPROTO_UDP
	 */
	uint8_t family;
	uint8_t _reserved;
	uint16_t src_port;
	/* Local address; which member is valid presumably follows `family`
	 * -- confirm against the code that fills the key. */
	union {
		uint32_t src_ip4;
		uint32_t src_ip6[4];
	};
} __attribute__((__packed__));
*/ 55 | uint64_t netns_cookie; 56 | 57 | /* ID to distinguish reuseport groups one from another */ 58 | uint32_t random_id; 59 | 60 | struct udp_grm_dissector dis; 61 | 62 | uint32_t working_gen[MAX_APPS]; 63 | uint32_t max_idx[MAX_GENS]; 64 | uint64_t cookies[MAX_GENS][MAX_SOCKETS_IN_GEN]; 65 | 66 | /* Metrics. Remember about forward compat of this struct in 67 | * case userspace is newer version. */ 68 | /* 0. Set from the userspace daemon */ 69 | uint64_t socket_critical_gauge; 70 | uint64_t socket_critical; 71 | 72 | /* 1. Packet processing */ 73 | uint64_t rx_processed_total; 74 | uint64_t rx_internal_state_error; 75 | uint64_t rx_cbpf_prog_error; 76 | uint64_t rx_packet_too_short_error; 77 | 78 | /* 2. Existing flows */ 79 | uint64_t rx_dissected_ok_total; 80 | uint64_t rx_flow_ok; /* Flow entry or cookie found, and dispatch went fine. */ 81 | uint64_t rx_flow_rg_conflict; /* Socket chosen from wrong reuseport group */ 82 | uint64_t rx_flow_other_error; /* flow entry or socket cookie pointing to dead 83 | socket */ 84 | uint64_t rx_flow_new_unseen; /* Likely new/fresh flow. */ 85 | 86 | uint64_t rx_flow_new_had_expired; /* Subset of rx_flow_new_unseen, hitting expired 87 | flow entry, indicative of too short flow 88 | entry timeout perhaps. We can't know if the 89 | old cookie is legit or not, packet dispatched 90 | ot new flows */ 91 | uint64_t rx_flow_new_bad_cookie; /* Subset of rx_flow_new_unseen, 92 | * extracting cookie from packet worked, 93 | * but the cookie checksum was invalid. */ 94 | 95 | /* 3. New flows */ 96 | uint64_t rx_new_flow_total; 97 | uint64_t rx_new_flow_working_gen_dispatch_ok; 98 | uint64_t rx_new_flow_working_gen_dispatch_error; 99 | 100 | /* Sendmsg */ 101 | uint64_t tx_total; 102 | uint64_t tx_flow_create_ok; 103 | uint64_t tx_flow_create_from_expired_ok; /* Subset of above. 
/* Per socket state. Created on setsockopt. This is the value type of
 * sk_storage_map. */
struct socket_storage {
	/* Presumably the socket generation, the socket's index within that
	 * generation and the application index -- confirm against the
	 * setsockopt handler. */
	uint32_t sock_gen;
	uint32_t sock_idx;
	uint32_t sock_app;

	// socket cookie is not accessible from setsockopt context,
	// however it is accessible from bind()
	uint64_t so_cookie;

	uint64_t netns_cookie;
};
/* 3-bit checksum stored in the top bits of the first cookie byte.
 *
 * XORs the seed 0xD with the high and low nibbles of the low byte of
 * `sock_idx` and `sock_gen`, then folds bit 3 of that nibble into the low
 * three bits. The result is always in the range 0..7.
 *
 * Each argument is evaluated exactly once and may be an arbitrary
 * expression (e.g. `a | b`): it is captured in a local before use. */
#define GRM_COOKIE_CS(sock_gen, sock_idx)                                                \
	({                                                                               \
		uint32_t ___cs_gen = (sock_gen);                                         \
		uint32_t ___cs_idx = (sock_idx);                                         \
		uint8_t ___cs = (0xD ^ ((___cs_idx >> 4) & 0xF) ^ (___cs_idx & 0xF) ^    \
				 ((___cs_gen >> 4) & 0xF) ^ (___cs_gen & 0xF));          \
		___cs = (___cs & 0x7) ^ ((___cs >> 3) & 0x1);                            \
		___cs;                                                                   \
	})
/* Expanded AES-128 key schedule as filled in by AES_key_expand():
 * AES_ROUND_COUNT + 1 = 11 round keys of 16 bytes each, 176 bytes total. */
struct AES_ctx {
	uint8_t round_key[176];
};
/* Expand the 16-byte `key` into the 176-byte round key schedule in `ctx`.
 *
 * Returns 0 on success, -1 if either pointer is NULL.
 * Kept out of line (noinline); the unroll pragmas below are deliberate
 * and should not be changed without re-checking stack usage. */
__attribute__((noinline)) int AES_key_expand(struct AES_ctx *restrict ctx,
					     const struct aes_key *restrict key)
{
	if (ctx == NULL || key == NULL)
		return -1;
	uint8_t *round_key = ctx->round_key;

	unsigned i, j, k;
	uint8_t tmp[4]; // Used for the column/row operations

	// The first round key is the cipher key itself.
#pragma unroll
	for (i = 0; i < AES_KEY_WORDS; ++i) {
		round_key[(i * 4) + 0] = key->u8[(i * 4) + 0];
		round_key[(i * 4) + 1] = key->u8[(i * 4) + 1];
		round_key[(i * 4) + 2] = key->u8[(i * 4) + 2];
		round_key[(i * 4) + 3] = key->u8[(i * 4) + 3];
	}

	// Every further 4-byte word i is derived from words i-1 and
	// i-AES_KEY_WORDS; i runs over words 4..43.
// increases stack on unroll
#pragma nounroll
	for (i = AES_KEY_WORDS; i < AES_STATE_COLUMNS * (AES_ROUND_COUNT + 1); ++i) {
		{
			// tmp = previous word (i - 1)
			k = (i - 1) * 4;
			tmp[0] = round_key[k + 0];
			tmp[1] = round_key[k + 1];
			tmp[2] = round_key[k + 2];
			tmp[3] = round_key[k + 3];
		}

		// First word of each round key gets the extra transformation.
		if (i % AES_KEY_WORDS == 0) {
			{
				// Rotate the word left by one byte.
				const uint8_t u8tmp = tmp[0];
				tmp[0] = tmp[1];
				tmp[1] = tmp[2];
				tmp[2] = tmp[3];
				tmp[3] = u8tmp;
			}
			{
				// Substitute every byte through the S-box.
				tmp[0] = aes_sbox[tmp[0]];
				tmp[1] = aes_sbox[tmp[1]];
				tmp[2] = aes_sbox[tmp[2]];
				tmp[3] = aes_sbox[tmp[3]];
			}

			// XOR the round constant into the first byte.
			tmp[0] = tmp[0] ^ round_constants[i / AES_KEY_WORDS];
		}
		// word[i] = word[i - AES_KEY_WORDS] ^ tmp
		j = i * 4;
		k = (i - AES_KEY_WORDS) * 4;
		round_key[j + 0] = round_key[k + 0] ^ tmp[0];
		round_key[j + 1] = round_key[k + 1] ^ tmp[1];
		round_key[j + 2] = round_key[k + 2] ^ tmp[2];
		round_key[j + 3] = round_key[k + 3] ^ tmp[3];
	}
	return 0;
}
/*
 * Encrypt one block in place using the round keys held in `ctx`.
 *
 * Sequence: add_round_key(0), then for each round 1..AES_ROUND_COUNT
 * sub_bytes + shift_rows, followed by mix_columns + add_round_key(round) on
 * every round except the last; the last round is finished with
 * add_round_key(AES_ROUND_COUNT) after the loop.
 *
 * Returns 0 on success, -1 if either pointer is NULL. The return values of
 * add_round_key() and mix_columns() are not checked here.
 */
/* Costs 30k to inline */
__attribute__((noinline)) int AES_ECB_encrypt(const struct AES_ctx *restrict ctx,
					      struct aes_key *restrict state)
{
	if (ctx == NULL || state == NULL)
		return -1;

	uint8_t round = 0;

	/* Initial whitening with round key 0. */
	add_round_key(0, state, ctx);
#pragma unroll
	/* No loop condition: the only exit is the break on the final round. */
	for (round = 1;; ++round) {
		sub_bytes(state);
		shift_rows(state);
		if (round == AES_ROUND_COUNT) {
			/* Final round skips mix_columns. */
			break;
		}
		mix_columns(state);
		add_round_key(round, state, ctx);
	}
	/* Round key for the final round, applied outside the loop. */
	add_round_key(AES_ROUND_COUNT, state, ctx);
	return 0;
}
def get_inherited_sockets(listenfds=32, protocol=IPPROTO_UDP):
    """Return sockets inherited from the parent process.

    File descriptors ``3 .. listenfds + 2`` are probed. A descriptor is kept
    when it is a socket whose ``SO_PROTOCOL`` equals *protocol*; it is then
    wrapped (same fd, not a duplicate) in a ``MockSocket``.

    Args:
        listenfds: number of descriptors to probe, starting at fd 3.
        protocol: protocol number a socket must have to be returned.

    Returns:
        List of ``MockSocket`` objects, in ascending fd order.
    """
    inherited = []
    for fd in range(3, listenfds + 3):
        # In python we need a socket object to call getsockopt; fromfd()
        # duplicates the descriptor, so the probe object must be closed.
        try:
            probe = socket.fromfd(fd, 0, 0, 0)
        except OSError:
            # fd is not open
            continue
        try:
            domain = probe.getsockopt(SOL_SOCKET, SO_DOMAIN)
            sock_type = probe.getsockopt(SOL_SOCKET, SO_TYPE)
            sock_proto = probe.getsockopt(SOL_SOCKET, SO_PROTOCOL)
        except OSError:
            # open, but not a socket
            continue
        finally:
            probe.close()

        # Compare against the caller's filter. The previous code overwrote
        # the `protocol` argument with the probed value and always compared
        # against IPPROTO_UDP, so the parameter had no effect.
        if sock_proto == protocol:
            inherited.append(
                MockSocket(domain, sock_type, sock_proto, fileno=fd))
    return inherited
def sock_to_str(s):
    """Format a bound socket as ``addr:port (cookie)`` for log output.

    IPv6 addresses are wrapped in brackets (``[addr]:port``). The cookie is
    the socket's ``SO_COOKIE`` value printed in hex.
    """
    so_cookie, = struct.unpack(
        'Q', s.getsockopt(socket.SOL_SOCKET, SO_COOKIE, 8))
    # AF_INET yields (host, port) but AF_INET6 yields
    # (host, port, flowinfo, scope_id); a plain two-name unpack raised
    # ValueError for every IPv6 socket.
    host, port = s.getsockname()[:2]
    if ':' in host:
        return '[%s]:%d (%08x)' % (host, port, so_cookie)
    return '%s:%d (%08x)' % (host, port, so_cookie)
global last_sock 130 | last_sock = self 131 | return super().recv(*args, **kwargs) 132 | 133 | def recvfrom(self, *args, **kwargs): 134 | global last_sock 135 | last_sock = self 136 | return super().recvfrom(*args, **kwargs) 137 | 138 | def read(self, *args, **kwargs): 139 | print("C") 140 | global last_sock 141 | last_sock = self 142 | return super().read(*args, **kwargs) 143 | 144 | 145 | async def main(args): 146 | sys.stdout.reconfigure(line_buffering=True) 147 | sd_inherited = get_inherited_sockets() 148 | sd_bound = [] 149 | for addr in args.listen: 150 | ip, _, port = addr.rpartition(':') 151 | port = int(port) 152 | ip = ipaddress.ip_address(ip.strip("[]")) 153 | family = socket.AF_INET if ip.version == 4 else socket.AF_INET6 154 | 155 | sock = MockSocket(family, SOCK_DGRAM) 156 | sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) 157 | sock.bind((str(ip), port)) 158 | sd_bound.append(sock) 159 | 160 | if sd_inherited: 161 | print('[*] Inherited %s' % ' '.join(sock_to_str(s) 162 | for s in sd_inherited)) 163 | if sd_bound: 164 | print('[*] Bound to %s' % ' '.join(sock_to_str(s) for s in sd_bound)) 165 | 166 | sockets = sd_inherited + sd_bound 167 | if not sockets: 168 | raise "Pass listen addr like 127.0.0.1:443, or use activate.py" 169 | 170 | def gen_cid(): 171 | fcookie = b'XXX' 172 | if last_sock: 173 | try: 174 | _sock_gen, sock_idx, fcookie, _ = struct.unpack( 175 | "IIHH", last_sock.getsockopt(socket.IPPROTO_UDP, UDP_GRM_SOCKET_GEN, 12)) 176 | fcookie = socket.ntohs(fcookie) 177 | except OSError: 178 | print('err') 179 | pass 180 | cid = struct.pack("data; 22 | uint8_t *pkt_end = md->data_end; 23 | /* Advance UDP header */ 24 | pkt += 8; 25 | 26 | if (md->len < MIN_QUIC_LEN) { 27 | /* Packet too short */ 28 | return IERR_SANITY; 29 | } 30 | 31 | if (pkt + MIN_QUIC_LEN > pkt_end) { 32 | /* Non-linear packet */ 33 | int r = bpf_skb_load_bytes(md, 8, _packet, MIN_QUIC_LEN); 34 | if (r != 0) 35 | return IERR_LOAD; 36 | pkt = &_packet[0]; 37 | 
/*
 * Bespoke dissector for digest 0xDEAD: picks a dispatch value for a QUIC
 * packet, either from the SNI of an Initial packet or from the DCID.
 *
 * On IERR_OK, *retval holds either
 *   - 0x80000000 | app  (SNI matched, SNI not matched -> app 0, DCID carrying
 *     an app number, or DCID not recognised -> app 0), or
 *   - the 16-bit cookie taken from the DCID.
 * Any other return value is an IERR_* code and *retval is left untouched.
 */
static int run_0xDEAD(struct sk_reuseport_md *md, struct reuseport_storage *state,
		      int *retval)
{
	if (state == NULL || retval == NULL)
		return IERR_SANITY;
	const uint8_t verbose = state->verbose;

	struct dcid dcid = {};
	uint8_t dcid_len = 0;
	uint8_t *sni = NULL;
	size_t sni_len = 0;
	int is_init_packet = 0;

	/* Three kinds of results:
	 *  - not IERR_OK means hard error, most likely short packet
	 *  - IERR_OK AND sni means SNI found
	 *  - IERR_OK AND !sni means DCID found
	 */
	int r = parse_quic_fast(md, &dcid, &dcid_len, &is_init_packet);
	if (r != IERR_OK) {
		log_printf("[ ] Quic parse failed hard err=%d\n", r);
		return r;
	}
	if (verbose >= 3)
		log_printf_hex20("DCID", &dcid);

	if (is_init_packet) {
		/* This is slow path, parsing QUIC initial packet. This does
		 * AES decryption, and SHA, don't overoptimize this for
		 * speed. There is no need. */

		struct scratch *scratch = percpu_scratch_page();
		if (scratch == NULL) {
			return IERR_SANITY;
		}

		// Keys from DCID
		{
			/* DCID is copied to *tmp, then copy secret to scratch->secret */
			expand_client_keys_from_dcid(scratch, &dcid, dcid_len);
			if (verbose >= 4) {
				log_printf_hex16("quic_key", &scratch->quic_key);
				log_printf_hex16("quic_iv", &scratch->quic_iv);
				log_printf_hex16("quic_hp", &scratch->quic_hp);
			}
		}

		size_t enc_offset = 0, packet_len = 0;
		/* Initial packet, slow path */
		r = quic_parse_hdr(md, scratch, &enc_offset, &packet_len, verbose);
		/* IERR_LOAD - means problems with data load (perhaps wrong offsets?)
		 * IERR_SANITY - failed assumptions, like dcid>20
		 * IERR_BADINSTR - bad magic, like wrong quic version*/
		if (r != IERR_OK)
			return r;

		/* A failed or empty SNI extraction is not fatal: sni stays NULL
		 * and the DCID-based dispatch below is used instead. */
		r = quic_extract_sni(md, scratch, enc_offset, packet_len, verbose);
		if (r == IERR_OK && scratch->sni_len) {
			sni = scratch->sni;
			sni_len = scratch->sni_len;
		} else if (r == IERR_OK) {
			if (verbose >= 1)
				log_printf(
					"[ ] Not a recognized crypto frame, or borken "
					"encryption\n");
		} else {
			if (verbose >= 1)
				log_printf("[ ] SNI failed %d\n", r);
		}
	}

	if (sni != NULL) {
		/* Advance SNI TLS extension header len */
		/* NOTE(review): sni_len is not checked to be >= 5 before the
		 * subtraction, and is unused afterwards (hence the void cast). */
		sni += 5;
		sni_len -= 5;
		(void)sni_len;

		if (verbose >= 2) {
			/* tmp make short for log */
			/* NOTE(review): assumes the buffer behind sni is at least
			 * 5 + 65 bytes long - confirm against struct scratch. */
			uint8_t x = sni[64];
			sni[64] = '\x00';
			/* Inner check is redundant with the enclosing one. */
			if (verbose >= 2)
				log_printf(" SNI extracted: %s\n", sni);
			sni[64] = x;
		}

		/* Compare against each configured hostname; first match wins. */
		uint32_t i;
		for (i = 0; i < MAX_BESPOKE_SNI; i++) {
			if (i >= state->dis.bespoke_hostname_len)
				break;
			/* Trunc at 61 bytes? */
			/* Fixed-length compare of BESPOKE_SNI_LEN - 1 bytes; the
			 * extracted SNI length is not taken into account. */
			r = memcmp(sni, state->dis.bespoke_sni[i].hostname,
				   BESPOKE_SNI_LEN - 1);
			// log_printf("r=%d\n");
			if (r == 0) {
				/* NOTE(review): the message always says app=1, while
				 * the value dispatched is bespoke_sni[i].app. */
				if (verbose >= 1)
					log_printf("[ ] SNI ok, app=1\n");
				*retval = 0x80000000ULL | state->dis.bespoke_sni[i].app;
				return IERR_OK;
			}
		}
		if (verbose >= 1)
			log_printf("[ ] SNI not matched, dispatching to app=0\n");
		*retval = 0x80000000ULL | 0x0;
		return IERR_OK;
	}

	/* Custom DCID */
	/* Layout read here: u8[0] == 1 marks the format, u8[1..2] is a 16-bit
	 * cookie (byte-swapped before use), u8[4] is the app number used when
	 * the cookie is zero. */
	if (dcid.u8[0] == 1) {
		uint16_t cookie = *(uint16_t *)&dcid.u8[1];
		cookie = bswap16(cookie);
		if (cookie == 0) {
			uint8_t app = dcid.u8[4];
			if (verbose >= 1)
				log_printf("[ ] DCID by app %d\n", app);
			*retval = 0x80000000ULL | app;
		} else {
			if (verbose >= 1)
				log_printf("[ ] DCID by cookie %04x\n", cookie);
			*retval = cookie;
		}
		return IERR_OK;
	}
	if (verbose >= 1)
		log_printf("[ ] DCID not ours, dispatching to app=0\n");
	*retval = 0x80000000ULL | 0x0;
	return IERR_OK;
}
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | #![cfg(target_os = "linux")] 6 | 7 | use crate::types::{UdpGrmDissectorOpts, UdpGrmSocketGen}; 8 | use std::os::fd::{AsFd, AsRawFd, BorrowedFd}; 9 | use std::{io, mem}; 10 | 11 | pub mod types; 12 | 13 | pub trait UdpGrmSupport { 14 | /// Get working gen for socket group, ENOPROTOOPT if udpgrm cgroup hooks not present. 15 | fn get_working_gen(&self) -> io::Result; 16 | 17 | /// Get the socket generation for the socket. 18 | fn get_socket_gen(&self) -> io::Result; 19 | 20 | /// Get the app number for the socket. 21 | fn get_app_number(&self) -> io::Result; 22 | 23 | /// Change the current working generation for this socket set. 24 | fn set_working_gen(&self, working_gen: u32) -> io::Result<()>; 25 | 26 | /// Set a dissector for socket group. 27 | fn set_dissector(&self, dissector_opts: UdpGrmDissectorOpts) -> io::Result<()>; 28 | 29 | /// Set the socket gen of this socket. 30 | fn set_socket_gen(&self, working_gen: u32) -> io::Result<()>; 31 | 32 | /// Set the app number of this socket. 
33 | fn set_app_number(&self, app_no: u32) -> io::Result<()>; 34 | } 35 | 36 | impl UdpGrmSupport for std::net::UdpSocket { 37 | fn get_working_gen(&self) -> io::Result { 38 | fd_get_working_gen(self.as_fd()) 39 | } 40 | 41 | fn get_socket_gen(&self) -> io::Result { 42 | fd_get_socket_gen(self.as_fd()) 43 | } 44 | 45 | fn get_app_number(&self) -> io::Result { 46 | fd_get_app_number(self.as_fd()) 47 | } 48 | 49 | fn set_working_gen(&self, working_gen: u32) -> io::Result<()> { 50 | fd_set_working_gen(self.as_fd(), working_gen) 51 | } 52 | 53 | fn set_dissector(&self, dissector_opts: UdpGrmDissectorOpts) -> io::Result<()> { 54 | fd_set_dissector(self.as_fd(), dissector_opts) 55 | } 56 | 57 | fn set_socket_gen(&self, working_gen: u32) -> io::Result<()> { 58 | fd_set_socket_gen(self.as_fd(), working_gen) 59 | } 60 | 61 | fn set_app_number(&self, app_no: u32) -> io::Result<()> { 62 | fd_set_app_number(self.as_fd(), app_no) 63 | } 64 | } 65 | 66 | #[cfg(feature = "socket2")] 67 | impl UdpGrmSupport for socket2::Socket { 68 | fn get_working_gen(&self) -> io::Result { 69 | fd_get_working_gen(self.as_fd()) 70 | } 71 | 72 | fn get_socket_gen(&self) -> io::Result { 73 | fd_get_socket_gen(self.as_fd()) 74 | } 75 | 76 | fn get_app_number(&self) -> io::Result { 77 | fd_get_app_number(self.as_fd()) 78 | } 79 | 80 | fn set_working_gen(&self, working_gen: u32) -> io::Result<()> { 81 | fd_set_working_gen(self.as_fd(), working_gen) 82 | } 83 | 84 | fn set_dissector(&self, dissector_opts: UdpGrmDissectorOpts) -> io::Result<()> { 85 | fd_set_dissector(self.as_fd(), dissector_opts) 86 | } 87 | 88 | fn set_socket_gen(&self, working_gen: u32) -> io::Result<()> { 89 | fd_set_socket_gen(self.as_fd(), working_gen) 90 | } 91 | 92 | fn set_app_number(&self, app_no: u32) -> io::Result<()> { 93 | fd_set_app_number(self.as_fd(), app_no) 94 | } 95 | } 96 | 97 | #[cfg(feature = "tokio")] 98 | impl UdpGrmSupport for tokio::net::UdpSocket { 99 | fn get_working_gen(&self) -> io::Result { 100 | 
/// Convert a libc-style return value into an `io::Result`.
///
/// `-1` is mapped to the calling thread's last OS error (`errno`); any other
/// value is passed through unchanged in `Ok`.
#[cfg(target_os = "linux")]
fn cvt_linux_error(t: i32) -> io::Result<i32> {
    if t == -1 {
        Err(io::Error::last_os_error())
    } else {
        Ok(t)
    }
}
177 | Ok(()) 178 | } 179 | 180 | fn fd_set_socket_gen(socket: BorrowedFd, socket_gen: u32) -> io::Result<()> { 181 | set_opt( 182 | socket.as_raw_fd(), 183 | libc::IPPROTO_UDP, 184 | types::UDP_GRM_SOCKET_GEN, 185 | socket_gen, 186 | )?; 187 | Ok(()) 188 | } 189 | 190 | fn fd_get_socket_gen(socket: BorrowedFd) -> io::Result { 191 | let mut s_gen = UdpGrmSocketGen::default(); 192 | let mut size = std::mem::size_of::() as u32; 193 | 194 | get_opt( 195 | socket.as_raw_fd(), 196 | libc::IPPROTO_UDP, 197 | types::UDP_GRM_SOCKET_GEN, 198 | &mut s_gen, 199 | &mut size, 200 | )?; 201 | Ok(s_gen) 202 | } 203 | 204 | fn fd_set_working_gen(socket: BorrowedFd, working_gen: u32) -> io::Result<()> { 205 | set_opt( 206 | socket.as_raw_fd(), 207 | libc::IPPROTO_UDP, 208 | types::UDP_GRM_WORKING_GEN, 209 | working_gen, 210 | )?; 211 | Ok(()) 212 | } 213 | 214 | /// Get working gen for socket, ENOPROTOOPT if not present. 215 | fn fd_get_working_gen(socket: BorrowedFd) -> io::Result { 216 | let mut w_gen: u32 = 0; 217 | let mut size = std::mem::size_of::() as u32; 218 | 219 | get_opt( 220 | socket.as_raw_fd(), 221 | libc::IPPROTO_UDP, 222 | types::UDP_GRM_WORKING_GEN, 223 | &mut w_gen, 224 | &mut size, 225 | )?; 226 | Ok(w_gen) 227 | } 228 | 229 | fn fd_get_app_number(socket: BorrowedFd) -> io::Result { 230 | let mut app_no: u32 = 0; 231 | let mut size = std::mem::size_of::() as u32; 232 | 233 | get_opt( 234 | socket.as_raw_fd(), 235 | libc::IPPROTO_UDP, 236 | types::UDP_GRM_SOCKET_APP, 237 | &mut app_no, 238 | &mut size, 239 | )?; 240 | Ok(app_no) 241 | } 242 | 243 | fn fd_set_app_number(socket: BorrowedFd, app_no: u32) -> io::Result<()> { 244 | set_opt( 245 | socket.as_raw_fd(), 246 | libc::IPPROTO_UDP, 247 | types::UDP_GRM_SOCKET_APP, 248 | app_no, 249 | )?; 250 | Ok(()) 251 | } 252 | -------------------------------------------------------------------------------- /src/do_list.c: -------------------------------------------------------------------------------- 1 | // Copyright (c) 
/*
 * Append src to dst without ever writing outside dst[0 .. dst_sz-1].
 *
 * The result is always zero-terminated (truncating if needed), even when dst
 * was not terminated on entry. A dst_sz of 0 means there is no room for even
 * the terminator, so dst is left untouched.
 */
static void safe_strcat(char *restrict dst, char *restrict src, size_t dst_sz)
{
	/* Without this guard `dst_sz - 1` below wraps around and the
	 * terminator is written far outside the buffer. */
	if (dst_sz == 0)
		return;

	size_t l = strnlen(dst, dst_sz);
	if (l < dst_sz) {
		size_t room = dst_sz - l; /* > 0 because l < dst_sz */
		size_t src_len = strlen(src);
		size_t n = src_len < room ? src_len : room;
		memmove(&dst[l], src, n);
		l += n;
	}
	/* Buffer completely full: sacrifice the last byte for the terminator. */
	if (l == dst_sz) {
		l = dst_sz - 1;
	}
	dst[l] = '\x00';
}
snprintf(a, sizeof(a), "cbpf"); 72 | break; 73 | case DISSECTOR_BESPOKE: 74 | snprintf(a, sizeof(a), "bespoke"); 75 | break; 76 | case DISSECTOR_NOOP: 77 | snprintf(a, sizeof(a), "noop"); 78 | break; 79 | default: 80 | snprintf(a, sizeof(a), "%d", dissector_type); 81 | break; 82 | } 83 | 84 | printf("%s%s%s\n\tnetns 0x%lx dissector %s", t, msg_note != NULL ? " " : "", 85 | msg_note == NULL ? "" : msg_note, s->netns_cookie, a); 86 | if (dissector_type == DISSECTOR_FLOW) { 87 | printf(" flow_timeout_sec %u", s->dis.flow_entry_timeout_sec); 88 | } else if (dissector_type == DISSECTOR_CBPF) { 89 | printf(" apps %d filter_len %d", s->dis.max_apps, s->dis.filter_len); 90 | } else if (dissector_type == DISSECTOR_BESPOKE) { 91 | printf(" digest 0x%x", s->dis.bespoke_digest); 92 | } else { 93 | // No action needed 94 | } 95 | 96 | if (s->dis.label[0] != '\x00') { 97 | printf(" label %.*s", LABEL_SZ, s->dis.label); 98 | } 99 | 100 | if (s->verbose) { 101 | printf(" verbose"); 102 | } 103 | 104 | printf("\n"); 105 | 106 | if (sockhash_fd >= 0) { 107 | printf("\tsocket generations:\n"); 108 | uint32_t i, j; 109 | char line[4096]; // need at least 3230 bytes 110 | for (i = 0; i < MAX_GENS; i++) { 111 | int this_app = gen_pointed_by_wrk_gen(i, s); 112 | line[0] = '\x00'; 113 | for (j = 0; j < s->max_idx[i]; j++) { 114 | uint64_t c = s->cookies[i][j]; 115 | if (c == 0) 116 | break; 117 | 118 | uint64_t v = 0; 119 | int r = bpf_map_lookup_elem(sockhash_fd, &c, &v); 120 | if (verbose) { 121 | snprintf(a, sizeof(a), "%d:", j); 122 | safe_strcat(line, a, sizeof(line)); 123 | } 124 | 125 | if (r == -1 || v == 0) { 126 | // nonexistent socket. 
127 | if (this_app == -1 && verbose == 0) { 128 | // Ignore in print unless 129 | // working_gen 130 | continue; 131 | } 132 | snprintf(a, sizeof(a), "dead "); 133 | } else { 134 | snprintf(a, sizeof(a), "0x%lx ", c); 135 | } 136 | safe_strcat(line, a, sizeof(line)); 137 | } 138 | if (strlen(line) > 0 || this_app != -1) { 139 | printf("\t\tgen %2d %s", i, line); 140 | if (this_app != -1) { 141 | printf(" <= "); 142 | 143 | int a; 144 | for (a = 0; a < MAX_APPS; a++) { 145 | if (i == TO_WRK_GEN(s->dis.max_apps, a, 146 | s->working_gen[a])) 147 | printf(" app %d gen %d", a, 148 | s->working_gen[a]); 149 | } 150 | } 151 | printf("\n"); 152 | } 153 | } 154 | } 155 | int metrics_cnt = 0; 156 | 157 | #define METRIC(token) \ 158 | if (s->token > 0 || verbose) { \ 159 | if (metrics_cnt++ == 0) \ 160 | printf("\tmetrics:\n"); \ 161 | printf("\t\t" #token " %lu\n", s->token); \ 162 | } 163 | 164 | METRIC(socket_critical_gauge); 165 | METRIC(socket_critical); 166 | 167 | METRIC(rx_processed_total); 168 | METRIC(rx_internal_state_error); 169 | METRIC(rx_cbpf_prog_error); 170 | METRIC(rx_packet_too_short_error); 171 | 172 | METRIC(rx_dissected_ok_total); 173 | METRIC(rx_flow_ok); 174 | METRIC(rx_flow_rg_conflict); 175 | METRIC(rx_flow_other_error); 176 | METRIC(rx_flow_new_unseen); 177 | METRIC(rx_flow_new_had_expired); 178 | METRIC(rx_flow_new_bad_cookie); 179 | 180 | METRIC(rx_new_flow_total); 181 | METRIC(rx_new_flow_working_gen_dispatch_ok); 182 | METRIC(rx_new_flow_working_gen_dispatch_error); 183 | 184 | METRIC(tx_total); 185 | METRIC(tx_flow_create_ok); 186 | METRIC(tx_flow_create_from_expired_ok); 187 | METRIC(tx_flow_create_error); 188 | METRIC(tx_flow_update_ok); 189 | METRIC(tx_flow_update_conflict); 190 | } 191 | 192 | void do_list(int prog_fd, int map_fd, struct sockaddr_storage *reuseport_ss, int verbose) 193 | { 194 | int sockhash_fd = map_from_prog(prog_fd, "sockhash", NULL); 195 | /* allow it to be not found/-1 */ 196 | 197 | struct reuseport_storage_key key = 
{}; 198 | int err = 0; 199 | 200 | if (reuseport_ss->ss_family != AF_UNSPEC) { 201 | skey_from_ss(&key, reuseport_ss); 202 | } else { 203 | bpf_map_get_next_key(map_fd, NULL, &key); 204 | } 205 | while (!err) { 206 | struct reuseport_storage s = {}; 207 | int r = bpf_map_lookup_elem(map_fd, &key, &s); 208 | if (r == 0) { 209 | _do_list(&key, &s, verbose, sockhash_fd, NULL); 210 | } 211 | 212 | if (reuseport_ss->ss_family != AF_UNSPEC) { 213 | // finish loop 214 | break; 215 | } 216 | 217 | err = bpf_map_get_next_key(map_fd, &key, &key); 218 | } 219 | 220 | struct bpf_map_info good_map_info = {}; 221 | uint32_t info_len = sizeof(good_map_info); 222 | int r = bpf_obj_get_info_by_fd(map_fd, &good_map_info, &info_len); 223 | if (r != 0) { 224 | error(-1, errno, "bpf_obj_get_info_by_fd()"); 225 | } 226 | 227 | int *map_fd_list = map_by_name("reuseport_stora", map_fd_to_id(map_fd)); 228 | int map_fd_cnt = 0; 229 | for (map_fd_cnt = 0; map_fd_list && map_fd_list[map_fd_cnt] >= 0; map_fd_cnt++) { 230 | int map_fd = map_fd_list[map_fd_cnt]; 231 | 232 | struct bpf_map_info map_info = {}; 233 | uint32_t info_len = sizeof(map_info); 234 | int r = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len); 235 | if (r != 0) { 236 | error(-1, errno, "bpf_obj_get_info_by_fd()"); 237 | } 238 | 239 | if (good_map_info.key_size != map_info.key_size) { 240 | fprintf(stderr, "[!] 
map %u seems off, skipping\n", map_info.id); 241 | } 242 | 243 | struct reuseport_storage_key key = {}; 244 | if (reuseport_ss->ss_family != AF_UNSPEC) { 245 | skey_from_ss(&key, reuseport_ss); 246 | } else { 247 | bpf_map_get_next_key(map_fd, NULL, &key); 248 | } 249 | err = 0; 250 | while (!err) { 251 | struct reuseport_storage s = {}; 252 | int r = bpf_map_lookup_elem(map_fd, &key, &s); 253 | if (r == 0) { 254 | _do_list(&key, &s, verbose, -1, "(old)"); 255 | } 256 | 257 | if (reuseport_ss->ss_family != AF_UNSPEC) { 258 | // finish loop 259 | break; 260 | } 261 | 262 | err = bpf_map_get_next_key(map_fd, &key, &key); 263 | } 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /crates/udpgrm/examples/client.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 Cloudflare, Inc. 2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | use clap::Parser; 6 | use ring::rand::SecureRandom; 7 | use ring::rand::SystemRandom; 8 | 9 | const MAX_DATAGRAM_SIZE: usize = 1350; 10 | 11 | #[derive(Parser, Debug)] 12 | #[command(author, version, about, long_about = None)] 13 | struct Args { 14 | #[arg(short, long)] 15 | verbose: bool, 16 | 17 | #[arg(short, long, value_name = "IP:PORT", value_parser = clap::value_parser!(std::net::SocketAddr))] 18 | target: std::net::SocketAddr, 19 | 20 | #[arg(value_name = "URL")] 21 | url: Vec, 22 | 23 | #[arg(short, long)] 24 | deterministic: bool, 25 | } 26 | 27 | fn main() { 28 | let args = Args::parse(); 29 | 30 | let mut buf = [0; 65535]; 31 | let mut out = [0; MAX_DATAGRAM_SIZE]; 32 | 33 | let mut config = quiche::Config::new(quiche::PROTOCOL_VERSION).unwrap(); 34 | // *CAUTION*: this should not be set to `false` in production!!! 
35 | config.verify_peer(false); 36 | 37 | config 38 | .set_application_protos(quiche::h3::APPLICATION_PROTOCOL) 39 | .unwrap(); 40 | 41 | config.set_max_idle_timeout(5000); 42 | config.set_max_recv_udp_payload_size(MAX_DATAGRAM_SIZE); 43 | config.set_max_send_udp_payload_size(MAX_DATAGRAM_SIZE); 44 | config.set_initial_max_data(10_000_000); 45 | config.set_initial_max_stream_data_bidi_local(1_000_000); 46 | config.set_initial_max_stream_data_bidi_remote(1_000_000); 47 | config.set_initial_max_stream_data_uni(1_000_000); 48 | config.set_initial_max_streams_bidi(100); 49 | config.set_initial_max_streams_uni(100); 50 | config.set_disable_active_migration(true); 51 | 52 | for url in args.url { 53 | let url = url; 54 | 55 | // Setup the event loop. 56 | let mut poll = mio::Poll::new().unwrap(); 57 | let mut events = mio::Events::with_capacity(1024); 58 | 59 | let bind_addr = match args.target { 60 | std::net::SocketAddr::V4(_) => "0.0.0.0:0", 61 | std::net::SocketAddr::V6(_) => "[::]:0", 62 | }; 63 | 64 | let mut socket = mio::net::UdpSocket::bind(bind_addr.parse().unwrap()).unwrap(); 65 | poll.registry() 66 | .register(&mut socket, mio::Token(0), mio::Interest::READABLE) 67 | .unwrap(); 68 | 69 | let mut http3_conn = None; 70 | 71 | let mut scid = [0; quiche::MAX_CONN_ID_LEN]; 72 | if args.deterministic { 73 | scid[..4].copy_from_slice(b"\xDE\xAD\xBA\xBE"); 74 | } else { 75 | SystemRandom::new().fill(&mut scid[..]).unwrap(); 76 | } 77 | 78 | let scid = quiche::ConnectionId::from_ref(&scid); 79 | 80 | // Get local address. 81 | let local_addr = socket.local_addr().unwrap(); 82 | 83 | // Create a QUIC connection and initiate handshake. 
84 | let mut conn = 85 | quiche::connect(url.domain(), &scid, local_addr, args.target, &mut config).unwrap(); 86 | 87 | let (write, send_info) = conn.send(&mut out).expect("initial send failed"); 88 | 89 | while let Err(e) = socket.send_to(&out[..write], send_info.to) { 90 | if e.kind() == std::io::ErrorKind::WouldBlock { 91 | continue; 92 | } 93 | 94 | panic!("send() failed: {:?}", e); 95 | } 96 | 97 | let h3_config = quiche::h3::Config::new().unwrap(); 98 | 99 | let mut path = String::from(url.path()); 100 | 101 | if let Some(query) = url.query() { 102 | path.push('?'); 103 | path.push_str(query); 104 | } 105 | 106 | let req = vec![ 107 | quiche::h3::Header::new(b":method", b"GET"), 108 | quiche::h3::Header::new(b":scheme", url.scheme().as_bytes()), 109 | quiche::h3::Header::new(b":authority", url.host_str().unwrap().as_bytes()), 110 | quiche::h3::Header::new(b":path", path.as_bytes()), 111 | quiche::h3::Header::new(b"user-agent", b"quiche"), 112 | ]; 113 | 114 | let mut req_sent = false; 115 | 116 | loop { 117 | poll.poll(&mut events, conn.timeout()).unwrap(); 118 | 119 | 'read: loop { 120 | if events.is_empty() { 121 | conn.on_timeout(); 122 | 123 | break 'read; 124 | } 125 | 126 | let (len, from) = match socket.recv_from(&mut buf) { 127 | Ok(v) => v, 128 | 129 | Err(e) => { 130 | if e.kind() == std::io::ErrorKind::WouldBlock { 131 | break 'read; 132 | } 133 | 134 | panic!("recv() failed: {:?}", e); 135 | } 136 | }; 137 | 138 | let recv_info = quiche::RecvInfo { 139 | to: local_addr, 140 | from, 141 | }; 142 | 143 | // Process potentially coalesced packets. 144 | let _read = match conn.recv(&mut buf[..len], recv_info) { 145 | Ok(v) => v, 146 | 147 | Err(e) => { 148 | println!("[!] recv failed: {:?}", e); 149 | continue 'read; 150 | } 151 | }; 152 | } 153 | 154 | if conn.is_closed() { 155 | break; 156 | } 157 | 158 | // Create a new HTTP/3 connection once the QUIC connection is established. 
159 | if conn.is_established() && http3_conn.is_none() { 160 | http3_conn = Some( 161 | quiche::h3::Connection::with_transport(&mut conn, &h3_config) 162 | .expect("Unable to create HTTP/3 connection, check the server's uni stream limit and window size"), 163 | ); 164 | } 165 | 166 | // Send HTTP requests once the QUIC connection is established, and until 167 | // all requests have been sent. 168 | if let Some(h3_conn) = &mut http3_conn { 169 | if !req_sent { 170 | h3_conn.send_request(&mut conn, &req, true).unwrap(); 171 | 172 | req_sent = true; 173 | } 174 | } 175 | 176 | if let Some(http3_conn) = &mut http3_conn { 177 | // Process HTTP/3 events. 178 | loop { 179 | match http3_conn.poll(&mut conn) { 180 | Ok((_stream_id, quiche::h3::Event::Headers { .. })) => {} 181 | 182 | Ok((stream_id, quiche::h3::Event::Data)) => { 183 | while let Ok(read) = 184 | http3_conn.recv_body(&mut conn, stream_id, &mut buf) 185 | { 186 | print!("{}", unsafe { 187 | std::str::from_utf8_unchecked(&buf[..read]) 188 | }); 189 | } 190 | } 191 | 192 | Ok((_stream_id, quiche::h3::Event::Finished)) => { 193 | conn.close(true, 0x100, b"kthxbye").unwrap(); 194 | } 195 | 196 | Ok((_stream_id, quiche::h3::Event::Reset(e))) => { 197 | println!("[!] request was reset by peer with {}, closing...", e); 198 | 199 | conn.close(true, 0x100, b"kthxbye").unwrap(); 200 | } 201 | 202 | Ok((_, quiche::h3::Event::PriorityUpdate)) => unreachable!(), 203 | 204 | Ok((_goaway_id, quiche::h3::Event::GoAway)) => {} 205 | 206 | Err(quiche::h3::Error::Done) => { 207 | break; 208 | } 209 | 210 | Err(e) => { 211 | println!("[!] HTTP/3 processing failed: {:?}", e); 212 | 213 | break; 214 | } 215 | } 216 | } 217 | } 218 | 219 | loop { 220 | let (write, send_info) = match conn.send(&mut out) { 221 | Ok(v) => v, 222 | 223 | Err(quiche::Error::Done) => { 224 | break; 225 | } 226 | 227 | Err(e) => { 228 | println!("[!] 
send failed: {:?}", e); 229 | 230 | conn.close(false, 0x1, b"fail").ok(); 231 | break; 232 | } 233 | }; 234 | 235 | if let Err(e) = socket.send_to(&out[..write], send_info.to) { 236 | if e.kind() == std::io::ErrorKind::WouldBlock { 237 | break; 238 | } 239 | 240 | panic!("send() failed: {:?}", e); 241 | } 242 | } 243 | 244 | if conn.is_closed() { 245 | break; 246 | } 247 | } 248 | } 249 | } 250 | -------------------------------------------------------------------------------- /examples/echoserver.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | 3 | # Copyright (c) 2025 Cloudflare, Inc. 4 | # Licensed under the Apache 2.0 license found in the LICENSE file or at: 5 | # https://opensource.org/licenses/Apache-2.0 6 | 7 | ''' 8 | Example of DISSECTOR_FLOW. 9 | 10 | $ sudo systemd-run \ 11 | --unit echoserver \ 12 | -p Type=notify \ 13 | -p NotifyAccess=all \ 14 | -p FileDescriptorStoreMax=128\ 15 | -p ExecStartPre="$PWD/udpgrm --install --self" \ 16 | -p ExecStartPre="$PWD/tools/udpgrm_activate.py \ 17 | --no-register \ 18 | --count=8 \ 19 | xxx 0.0.0.0:4433" \ 20 | -p KillMode=process \ 21 | -p KillSignal=SIGTERM \ 22 | -p Restart=always \ 23 | -- $PWD/mmdecoy \ 24 | -- $PWD/examples/venv/bin/python3 $PWD/examples/echoserver.py 25 | 26 | $ nc -u 127.0.0.1 4433 27 | hello world 28 | sk=0x0016876f data=b'hello wo' 29 | 30 | $ sudo ./udpgrm flows 31 | [ ] Retrievieng BPF progs from /sys/fs/bpf/udpgrm 32 | 0.0.0.0:4433 33 | so_cookie 0x16876f 34 | 1807193a age 18.1s 35 | 36 | ''' 37 | 38 | import subprocess 39 | import socket 40 | import select 41 | from socket import (SOL_SOCKET, SO_DOMAIN, SO_TYPE, SO_PROTOCOL) 42 | 43 | import os 44 | import sys 45 | import struct 46 | import ipaddress 47 | import itertools 48 | import time 49 | import signal 50 | from systemd.daemon import notify 51 | 52 | 53 | socket.IP_PKTINFO = 8 54 | socket.SO_COOKIE = 57 55 | 56 | 57 | SOCKETS = {} 58 | 59 | RPCOUNT = 8 60 | 61 | 
def sockets_from_activation():
    """Collect the sockets handed to this process by socket activation.

    Reads ``LISTEN_FDS`` and ``LISTEN_FDNAMES`` from the environment and wraps
    every inherited descriptor (numbered from 3 upwards) in a socket object.
    Descriptors that turn out not to be sockets are skipped.

    Returns:
        A list of ``(fdname, socket)`` tuples in descriptor order.

    Raises:
        OSError: if ``LISTEN_FDNAMES`` does not carry exactly ``LISTEN_FDS``
            names.
    """
    listen_fds = int(os.environ.get('LISTEN_FDS', '0'))
    raw_names = os.environ.get('LISTEN_FDNAMES')
    fdnames = raw_names.split(':') if raw_names else []

    if len(fdnames) != listen_fds:
        raise OSError("LISTEN_FDS doesn't match LISTEN_FDNAMES")

    activated = []
    for fd, fdname in zip(range(3, listen_fds + 3), fdnames):
        # In python we need a socket object to call getsockopt.  fromfd()
        # dups the descriptor, so the probe object must always be closed.
        tmp_sd = socket.fromfd(fd, 0, 0, 0)
        try:
            domain = tmp_sd.getsockopt(SOL_SOCKET, SO_DOMAIN)
            sock_type = tmp_sd.getsockopt(SOL_SOCKET, SO_TYPE)
            protocol = tmp_sd.getsockopt(SOL_SOCKET, SO_PROTOCOL)
        except OSError:
            # not a socket
            continue
        finally:
            tmp_sd.close()
        sd = socket.socket(domain, sock_type, protocol, fileno=fd)
        activated.append((fdname, sd))
    return activated


def addr_to_str(addr):
    """Format a socket address tuple as ``ip:port`` (``[ip]:port`` for IPv6).

    Only ``addr[0]`` (the IP string) and ``addr[1]`` (the port) are used, so
    both 2-tuples and the longer IPv6 tuples are accepted.
    """
    ip_s = addr[0] if ':' not in addr[0] else "[%s]" % (addr[0],)
    return "%s:%d" % (ip_s, addr[1])
sys.stdout.reconfigure(line_buffering=True)

# Sockets requested on the command line: RPCOUNT SO_REUSEPORT sockets per
# "ip:port" argument, grouped in SOCKETS by their bound address.
for addr in sys.argv[1:]:
    ip, separator, port = addr.rpartition(':')
    port = int(port)
    ip = ipaddress.ip_address(ip.strip("[]"))
    family = socket.AF_INET if ip.version == 4 else socket.AF_INET6

    addr = (str(ip), port)
    for i in range(RPCOUNT):
        sd = socket.socket(family, socket.SOCK_DGRAM)
        if ip.version == 4:
            sd.setsockopt(socket.IPPROTO_IP, socket.IP_PKTINFO, 1)
        else:
            sd.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_RECVPKTINFO, 1)
            sd.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, 1)

        sd.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)

        sd.bind(addr)
        # Port 0 asks the kernel for a free port; the remaining sockets of
        # the group must bind to whatever port the first one received.
        if i == 0 and addr[1] == 0:
            addr = sd.getsockname()
        if i == 0:
            print("%d [+] Opening %d sockets %s" %
                  (os.getpid(), RPCOUNT, addr_to_str(addr)))

        SOCKETS.setdefault(addr, []).append(sd)

# Sockets inherited through socket activation.
for sdname, sd in sockets_from_activation():
    family = sd.getsockopt(SOL_SOCKET, SO_DOMAIN)
    # The original `assert <value>, SOCK_DGRAM` never compared the type (and
    # named an undefined SOCK_DGRAM); check it explicitly instead.
    if sd.getsockopt(SOL_SOCKET, SO_TYPE) != socket.SOCK_DGRAM:
        raise OSError("activated socket %s is not SOCK_DGRAM" % (sdname,))
    addr = sd.getsockname()

    print("%d [+] Socket from acivation %s: %s" %
          (os.getpid(), sdname, addr_to_str(addr,)))

    if family == socket.AF_INET:
        sd.setsockopt(socket.IPPROTO_IP, socket.IP_PKTINFO, 1)
    else:
        sd.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_RECVPKTINFO, 1)

    SOCKETS.setdefault(addr, []).append(sd)


# udpgrm socket options, used at level IPPROTO_UDP.
UDP_GRM_WORKING_GEN = 200
UDP_GRM_SOCKET_GEN = 201
UDP_GRM_DISSECTOR = 202

# For every bound address: put all sockets into generation wrk_gen + 1, wait
# until udpgrm has assigned each of them an index, then make that generation
# the working one.
for addr, sockets in SOCKETS.items():
    sd = sockets[0]
    try:
        wrk_gen = sd.getsockopt(socket.IPPROTO_UDP, UDP_GRM_WORKING_GEN)
    except OSError:
        print("%d [!] Failed: udpgrm not loaded" % (os.getpid(),))
        break
    for s in sockets:
        try:
            s.setsockopt(socket.IPPROTO_UDP, UDP_GRM_SOCKET_GEN, wrk_gen + 1)
        except BlockingIOError:
            # Retry once after a short pause.
            print("%d [ ] blocking" % (os.getpid(),))
            time.sleep(0.2)
            s.setsockopt(socket.IPPROTO_UDP, UDP_GRM_SOCKET_GEN, wrk_gen + 1)

    # max 1s wait (shared by all sockets of this address)
    t0 = time.time()
    for s in sockets:
        while t0 + 1 > time.time():
            v = s.getsockopt(socket.IPPROTO_UDP, UDP_GRM_SOCKET_GEN, 8)
            sk_gen, sk_idx = struct.unpack('II', v)
            # 0xffffffff is treated as "index not assigned yet".
            if sk_idx != 0xffffffff:
                break
            os.sched_yield()
            time.sleep(0.01)
        else:
            print('%d [!] Failed to sync with udpgrm' % (os.getpid(),))
    sd.setsockopt(socket.IPPROTO_UDP, UDP_GRM_WORKING_GEN, wrk_gen + 1)
    print("%d [ ] udpgrm: %s working gen %d -> %d" %
          (os.getpid(), addr_to_str(addr), wrk_gen, wrk_gen+1))
def unpack_cmsg(cmsg):
    """Extract the packet's local address from PKTINFO ancillary data.

    Args:
        cmsg: iterable of ``(level, type, data)`` tuples as returned by
            ``socket.recvmsg()``.

    Returns:
        ``(ip, 0, if_index)`` for IPv4, ``(ip, 0, 0, if_index)`` for IPv6, or
        ``None`` when ``cmsg`` is empty.  If several PKTINFO messages are
        present, the last one wins.

    Raises:
        ValueError: on an ancillary message that is neither ``IP_PKTINFO``
            nor ``IPV6_PKTINFO``.
    """
    addr = None
    for cmsg_level, cmsg_type, data in cmsg:
        if cmsg_level == socket.IPPROTO_IPV6 and cmsg_type == socket.IPV6_PKTINFO:
            # struct in6_pktinfo {
            #     struct in6_addr ipi6_addr;     16 bytes
            #     int             ipi6_ifindex;   4 bytes
            # };
            addr_bin, if_index = struct.unpack("16sI", data)
            addr = socket.inet_ntop(socket.AF_INET6, addr_bin)
            addr = (addr, 0, 0, if_index)
        elif cmsg_level == socket.IPPROTO_IP and cmsg_type == socket.IP_PKTINFO:
            # struct in_pktinfo {
            #     unsigned int   ipi_ifindex;    4 bytes
            #     struct in_addr ipi_spec_dst;   4 bytes
            #     struct in_addr ipi_addr;       4 bytes
            # };
            if_index, spec_dst_bin, addr_bin = struct.unpack("I4s4s", data)
            addr = socket.inet_ntop(socket.AF_INET, addr_bin)
            addr = (addr, 0, if_index)
        else:
            # An explicit raise survives `python -O`, unlike `assert (0)`.
            raise ValueError("unexpected control message: level=%d type=%d" %
                             (cmsg_level, cmsg_type))
    return addr


def pack_pktinfo_cmsg(local_addr):
    """Build the PKTINFO control message that sends from ``local_addr``.

    Args:
        local_addr: an address tuple in the shape produced by
            ``unpack_cmsg()``; the interface index is read from index 2
            (IPv4) or index 3 (IPv6).

    Returns:
        A ``(level, type, data)`` tuple suitable for ``socket.sendmsg()``.
    """
    if ':' not in local_addr[0]:
        local_addr_bin = socket.inet_pton(socket.AF_INET, local_addr[0])
        # ipi_ifindex, ipi_spec_dst (the source address to use), ipi_addr
        # (left zeroed).
        return (socket.IPPROTO_IP,
                socket.IP_PKTINFO,
                struct.pack("I4s4s", local_addr[2], local_addr_bin, b'\x00' * 4))
    else:
        local_addr_bin = socket.inet_pton(socket.AF_INET6, local_addr[0])
        # Same "16sI" layout that unpack_cmsg() reads.
        return (socket.IPPROTO_IPV6,
                socket.IPV6_PKTINFO,
                struct.pack("16sI", local_addr_bin, local_addr[3]))
# Number of termination signals seen so far, and whether a graceful stop
# has been requested.
sigint = 0
gracefull_quit = False

# Exit graceful when no msg after 15 sec
loop_in_seconds = 15

decoy_pid = os.getppid()


def sigint_handler(sig, frame):
    """Handle SIGINT/SIGTERM/SIGHUP.

    The first signal announces STOPPING, pokes the parent (decoy) process
    with SIGURG and switches the serve loop into drain mode; any later
    signal terminates the process immediately.
    """
    global sigint, gracefull_quit
    sigint += 1
    if sigint <= 1:
        notify("STOPPING=1")
        print("%d [!] SIG%d received, gracefull stop (decoy=%d)" %
              (os.getpid(), sig, decoy_pid))
        # reload / restart
        os.kill(decoy_pid, signal.SIGURG)
        gracefull_quit = True
        return

    print("%d [!] SIG%d received second time, terminating" %
          (os.getpid(), sig))
    sys.exit(0)


for _signum in (signal.SIGINT, signal.SIGTERM, signal.SIGHUP):
    signal.signal(_signum, sigint_handler)


LIST_OF_SOCKETS = list(itertools.chain.from_iterable(SOCKETS.values()))

if not LIST_OF_SOCKETS:
    print("%d [!] no sockets, exiting" % (os.getpid(),))
    sys.exit(0)

notify("READY=1")

# Echo loop: answer every datagram from the address it was sent to, and
# leave once a graceful stop was requested and a whole select() timeout
# passed without traffic.
while True:
    readable, _, _ = select.select(LIST_OF_SOCKETS, [], [], loop_in_seconds)
    for sock in readable:
        payload, ancillary, _flags, peer = sock.recvmsg(1024*64, 256)
        local_addr = unpack_cmsg(ancillary)

        raw_cookie = sock.getsockopt(socket.SOL_SOCKET, socket.SO_COOKIE, 8)
        cookie = struct.unpack('Q', raw_cookie)[0]
        reply_cmsg = pack_pktinfo_cmsg(local_addr)
        reply = ("sk=0x%08x data=%s\n" % (cookie, repr(payload[:8]))).encode()
        sock.sendmsg([reply], [reply_cmsg], 0, peer)
    if gracefull_quit and not readable:
        print("%d [!] All flows drained, quitting" % (os.getpid(),))
        break
2 | // Licensed under the Apache 2.0 license found in the LICENSE file or at: 3 | // https://opensource.org/licenses/Apache-2.0 4 | 5 | use libc::{sockaddr, sockaddr_in, sockaddr_in6, sockaddr_storage, socklen_t, AF_INET, AF_INET6}; 6 | use nix; 7 | 8 | use std::io; 9 | use std::io::{IoSlice, IoSliceMut}; 10 | 11 | use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6, UdpSocket}; 12 | use std::os::fd::FromRawFd; 13 | use std::os::fd::{AsFd, AsRawFd}; 14 | use udpgrm::types::{UdpGrmDissectorOpts, UdpGrmDissectorType}; 15 | use udpgrm::UdpGrmSupport; 16 | fn std_addr_to_c(addr: &SocketAddr, out: &mut sockaddr_storage) -> socklen_t { 17 | let sin_port = addr.port().to_be(); 18 | 19 | match addr { 20 | SocketAddr::V4(addr) => unsafe { 21 | let out_in = out as *mut _ as *mut sockaddr_in; 22 | 23 | let s_addr = u32::from_ne_bytes(addr.ip().octets()); 24 | let sin_addr = libc::in_addr { s_addr }; 25 | 26 | *out_in = sockaddr_in { 27 | sin_family: AF_INET as libc::sa_family_t, 28 | sin_addr, 29 | sin_port, 30 | sin_zero: std::mem::zeroed(), 31 | }; 32 | 33 | std::mem::size_of::() as socklen_t 34 | }, 35 | 36 | SocketAddr::V6(addr) => unsafe { 37 | let out_in6 = out as *mut _ as *mut sockaddr_in6; 38 | 39 | let sin6_addr = libc::in6_addr { 40 | s6_addr: addr.ip().octets(), 41 | }; 42 | 43 | *out_in6 = sockaddr_in6 { 44 | sin6_family: AF_INET6 as libc::sa_family_t, 45 | sin6_addr, 46 | sin6_port: sin_port, 47 | sin6_flowinfo: addr.flowinfo(), 48 | sin6_scope_id: addr.scope_id(), 49 | }; 50 | 51 | std::mem::size_of::() as socklen_t 52 | }, 53 | } 54 | } 55 | 56 | fn std_addr_from_c(addr: &sockaddr, addr_len: socklen_t) -> SocketAddr { 57 | match addr.sa_family as i32 { 58 | AF_INET => { 59 | assert!(addr_len as usize == std::mem::size_of::()); 60 | 61 | let in4 = unsafe { *(addr as *const _ as *const sockaddr_in) }; 62 | let ip_addr = Ipv4Addr::from(u32::from_be(in4.sin_addr.s_addr)); 63 | let port = u16::from_be(in4.sin_port); 64 | let out = 
SocketAddrV4::new(ip_addr, port); 65 | out.into() 66 | } 67 | 68 | AF_INET6 => { 69 | assert!(addr_len as usize == std::mem::size_of::()); 70 | 71 | let in6 = unsafe { *(addr as *const _ as *const sockaddr_in6) }; 72 | let ip_addr = Ipv6Addr::from(in6.sin6_addr.s6_addr); 73 | let port = u16::from_be(in6.sin6_port); 74 | let scope_id = in6.sin6_scope_id; 75 | let out = SocketAddrV6::new(ip_addr, port, in6.sin6_flowinfo, scope_id); 76 | out.into() 77 | } 78 | 79 | _ => unimplemented!("unsupported address type"), 80 | } 81 | } 82 | 83 | fn cvt_linux_error(t: i32) -> io::Result { 84 | if t == -1 { 85 | Err(io::Error::last_os_error()) 86 | } else { 87 | Ok(t) 88 | } 89 | } 90 | 91 | fn udp_socket_bind_with_control( 92 | addr: &SocketAddr, 93 | control: impl Fn(libc::c_int) -> io::Result<()>, 94 | ) -> io::Result { 95 | let fd = cvt_linux_error(unsafe { 96 | libc::socket( 97 | libc::AF_INET, 98 | libc::SOCK_DGRAM | libc::SOCK_NONBLOCK, 99 | 0 as libc::c_int, 100 | ) 101 | })?; 102 | 103 | control(fd)?; 104 | 105 | let mut ss: sockaddr_storage = unsafe { std::mem::zeroed() }; 106 | let len = std_addr_to_c(&addr, &mut ss); 107 | 108 | cvt_linux_error(unsafe { 109 | libc::bind( 110 | fd, 111 | &ss as *const libc::sockaddr_storage as *const sockaddr, 112 | len, 113 | ) 114 | })?; 115 | Ok(unsafe { UdpSocket::from_raw_fd(fd) }) 116 | } 117 | 118 | fn get_so_cookie(raw_fd: i32) -> io::Result { 119 | let mut val: u64 = 0; 120 | //let val = &mut _val as *mut libc::c_void; 121 | let mut size: libc::socklen_t = std::mem::size_of::() as u32; 122 | cvt_linux_error(unsafe { 123 | libc::getsockopt( 124 | raw_fd, 125 | libc::SOL_SOCKET, 126 | libc::SO_COOKIE, 127 | &mut val as *mut u64 as *mut libc::c_void, 128 | &mut size, 129 | ) 130 | })?; 131 | Ok(val) 132 | } 133 | 134 | fn main() { 135 | let listen_addr = "0.0.0.0:5201".parse::().unwrap(); 136 | 137 | let sd = udp_socket_bind_with_control(&listen_addr, |raw_fd| { 138 | let one: u32 = 1; 139 | cvt_linux_error(unsafe { 140 | 
libc::setsockopt( 141 | raw_fd, 142 | libc::SOL_SOCKET, 143 | libc::SO_REUSEPORT, 144 | &one as *const _ as *const libc::c_void, 145 | std::mem::size_of::() as libc::socklen_t, 146 | ) 147 | })?; 148 | 149 | if listen_addr.is_ipv4() { 150 | cvt_linux_error(unsafe { 151 | libc::setsockopt( 152 | raw_fd, 153 | libc::IPPROTO_IP, 154 | libc::IP_PKTINFO, 155 | &one as *const _ as *const libc::c_void, 156 | std::mem::size_of::() as libc::socklen_t, 157 | ) 158 | })?; 159 | } else { 160 | cvt_linux_error(unsafe { 161 | libc::setsockopt( 162 | raw_fd, 163 | libc::IPPROTO_IPV6, 164 | libc::IPV6_RECVPKTINFO, 165 | &one as *const _ as *const libc::c_void, 166 | std::mem::size_of::() as libc::socklen_t, 167 | ) 168 | })?; 169 | cvt_linux_error(unsafe { 170 | libc::setsockopt( 171 | raw_fd, 172 | libc::IPPROTO_IPV6, 173 | libc::IPV6_V6ONLY, 174 | &one as *const _ as *const libc::c_void, 175 | std::mem::size_of::() as libc::socklen_t, 176 | ) 177 | })?; 178 | } 179 | 180 | Ok(()) 181 | }) 182 | .unwrap(); 183 | 184 | let mut opts = UdpGrmDissectorOpts::default(); 185 | opts.dissector_type = UdpGrmDissectorType::DissectorFlow; 186 | match sd.set_dissector(opts) { 187 | Ok(_) => (), 188 | Err(e) if e.raw_os_error().unwrap() == libc::ENOPROTOOPT => { 189 | panic!("[!] 
cgroups hooks not loaded"); 190 | } 191 | Err(e) => { 192 | panic!("Failed to get working generation: {e}"); 193 | } 194 | } 195 | 196 | let gen = sd.get_working_gen().unwrap(); 197 | sd.set_socket_gen(gen + 1).unwrap(); 198 | sd.set_working_gen(gen + 1).unwrap(); 199 | 200 | let raw_fd = sd.as_raw_fd(); 201 | loop { 202 | let mut readfds = nix::sys::select::FdSet::new(); 203 | readfds.insert(sd.as_fd()); 204 | let _ = nix::sys::select::select(raw_fd + 1, &mut readfds, None, None, None); 205 | for fd in readfds.fds(None) { 206 | let mut buf = [0; 4096]; 207 | let x = IoSliceMut::new(&mut buf); 208 | let io_vec = &mut [x]; 209 | let mut cmsg_buf = nix::cmsg_space!([u8; 512]); 210 | let msg: nix::sys::socket::RecvMsg = 211 | nix::sys::socket::recvmsg( 212 | fd.as_raw_fd(), 213 | io_vec, 214 | Some(&mut cmsg_buf), 215 | nix::sys::socket::MsgFlags::empty(), 216 | ) 217 | .unwrap(); 218 | 219 | use nix::sys::socket::ControlMessage; 220 | use nix::sys::socket::ControlMessageOwned; 221 | 222 | let mut tx_cmsgs: Vec = vec![]; 223 | 224 | let mut ipv4_pi = None; 225 | let mut ipv6_pi = None; 226 | for cmsg in msg.cmsgs() { 227 | match cmsg { 228 | ControlMessageOwned::Ipv4PacketInfo(pi) => { 229 | ipv4_pi = Some(pi); 230 | } 231 | ControlMessageOwned::Ipv6PacketInfo(pi) => { 232 | ipv6_pi = Some(pi); 233 | } 234 | x => panic!("Unknown control message {:?}", x), 235 | } 236 | } 237 | let pi4; 238 | if let Some(pi) = ipv4_pi { 239 | pi4 = pi; 240 | tx_cmsgs.push(ControlMessage::Ipv4PacketInfo(&pi4)); 241 | } 242 | let pi6; 243 | if let Some(pi) = ipv6_pi { 244 | pi6 = pi; 245 | tx_cmsgs.push(ControlMessage::Ipv6PacketInfo(&pi6)); 246 | } 247 | 248 | let so_cookie = get_so_cookie(fd.as_raw_fd()).unwrap(); 249 | 250 | let s = format!("{:#08x} ", so_cookie); 251 | let b = &msg.iovs().next().unwrap(); 252 | let iov = &[IoSlice::new(s.as_bytes()), IoSlice::new(b)]; 253 | 254 | let _ = nix::sys::socket::sendmsg( 255 | fd.as_raw_fd(), 256 | iov, 257 | &tx_cmsgs, 258 | 
nix::sys::socket::MsgFlags::empty(), 259 | msg.address.as_ref(), 260 | ); 261 | 262 | println!("{:?} {:?}", fd, msg); 263 | } 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /ebpf/ebpf_inter.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0-only 2 | /* 3 | * Part of the code adapted from bpf_dbg.c 4 | * 5 | * https://github.com/torvalds/linux/blob/master/tools/bpf/bpf_dbg.c 6 | * 7 | * Copyright 2013 Daniel Borkmann 8 | * Copyright (c) 2025 Cloudflare, Inc. 9 | * Licensed under the GNU General Public License Version 2 found in the ebpf/LICENSE file or at: 10 | * https://opensource.org/license/gpl-2-0 11 | */ 12 | 13 | #ifndef BPF_MOD 14 | #define BPF_MOD 0x90 15 | #endif 16 | #ifndef BPF_XOR 17 | #define BPF_XOR 0xa0 18 | #endif 19 | 20 | #define BPF_LDX_B (BPF_LDX | BPF_B) 21 | #define BPF_LDX_W (BPF_LDX | BPF_W) 22 | #define BPF_JMP_JA (BPF_JMP | BPF_JA) 23 | #define BPF_JMP_JEQ (BPF_JMP | BPF_JEQ) 24 | #define BPF_JMP_JGT (BPF_JMP | BPF_JGT) 25 | #define BPF_JMP_JGE (BPF_JMP | BPF_JGE) 26 | #define BPF_JMP_JSET (BPF_JMP | BPF_JSET) 27 | #define BPF_ALU_ADD (BPF_ALU | BPF_ADD) 28 | #define BPF_ALU_SUB (BPF_ALU | BPF_SUB) 29 | #define BPF_ALU_MUL (BPF_ALU | BPF_MUL) 30 | #define BPF_ALU_DIV (BPF_ALU | BPF_DIV) 31 | #define BPF_ALU_MOD (BPF_ALU | BPF_MOD) 32 | #define BPF_ALU_NEG (BPF_ALU | BPF_NEG) 33 | #define BPF_ALU_AND (BPF_ALU | BPF_AND) 34 | #define BPF_ALU_OR (BPF_ALU | BPF_OR) 35 | #define BPF_ALU_XOR (BPF_ALU | BPF_XOR) 36 | #define BPF_ALU_LSH (BPF_ALU | BPF_LSH) 37 | #define BPF_ALU_RSH (BPF_ALU | BPF_RSH) 38 | #define BPF_MISC_TAX (BPF_MISC | BPF_TAX) 39 | #define BPF_MISC_TXA (BPF_MISC | BPF_TXA) 40 | #define BPF_LD_B (BPF_LD | BPF_B) 41 | #define BPF_LD_H (BPF_LD | BPF_H) 42 | #define BPF_LD_W (BPF_LD | BPF_W) 43 | 44 | #define MEMWORDS_MASK (BPF_MEMWORDS - 1) 45 | 46 | struct bpf_regs { 47 | uint32_t R; 48 | uint32_t A; 49 
| uint32_t X; 50 | uint32_t M[BPF_MEMWORDS]; 51 | }; 52 | 53 | struct interpret_ctx { 54 | struct sk_reuseport_md *md; 55 | struct reuseport_storage *state; 56 | int offset; 57 | 58 | struct bpf_regs r; 59 | int *retval; 60 | int retcode; 61 | 62 | uint32_t next_index; 63 | }; 64 | 65 | #define extract_u32(md, off) \ 66 | ({ \ 67 | uint32_t v; \ 68 | int r = bpf_skb_load_bytes(md, c->offset + off, &v, 4); \ 69 | if (r != 0) { \ 70 | c->retcode = IERR_LOAD; \ 71 | return LOOP_BREAK; \ 72 | } \ 73 | bpf_ntohl(v); \ 74 | }) 75 | 76 | #define extract_u16(md, off) \ 77 | ({ \ 78 | uint16_t v; \ 79 | int r = bpf_skb_load_bytes(md, c->offset + off, &v, 2); \ 80 | if (r != 0) { \ 81 | c->retcode = IERR_LOAD; \ 82 | return LOOP_BREAK; \ 83 | } \ 84 | bpf_ntohs(v); \ 85 | }) 86 | 87 | #define extract_u8(md, off) \ 88 | ({ \ 89 | uint8_t v; \ 90 | int r = bpf_skb_load_bytes(md, c->offset + off, &v, 1); \ 91 | if (r != 0) { \ 92 | c->retcode = IERR_LOAD; \ 93 | return LOOP_BREAK; \ 94 | } \ 95 | (v); \ 96 | }) 97 | 98 | /* log_printf("off=%d+%d b=%x\n", c->offset, off, v); \ */ 99 | 100 | /* Bytecode returns: 101 | -1 on PUSH/POP error (too long, too short stack) 102 | -2 failed to load bytes - too short packet? 
103 | -3 finished without return opcode 104 | -4 unrecognized instruction 105 | 0 program finished, retval filled with the 3-byte packed thing 106 | */ 107 | enum { 108 | IERR_OK = 0, 109 | IERR_STACK = -1, 110 | IERR_LOAD = -2, 111 | IERR_INSTREXCEEDED = -3, 112 | IERR_BADINSTR = -4, 113 | IERR_SANITY = -5, 114 | IERR_BADRETURNVALUE = -6, 115 | }; 116 | 117 | #define LOOP_BREAK 1 118 | #define LOOP_CONTINUE 0 119 | static int _do_interpret_loop(uint32_t index, void *_ctx) 120 | { 121 | struct interpret_ctx *c = _ctx; 122 | 123 | /* Workaround against ebpf limitation where we can't 124 | * arbitrarily move index variable forward*/ 125 | if (index < c->next_index) 126 | return LOOP_CONTINUE; 127 | 128 | if (index >= MAX_INSTR) { 129 | c->retcode = IERR_SANITY; 130 | return LOOP_BREAK; 131 | } 132 | struct sock_filter *f = &c->state->dis.filter[index]; 133 | 134 | if (c->state->verbose > 0) 135 | log_printf("#%02d: (0x%02x, %d, %d, 0x%08x)\n", index, f->code, f->jt, 136 | f->jf, f->k); 137 | 138 | struct bpf_regs *r = &c->r; 139 | uint32_t K = f->k; 140 | 141 | // Consider supporting SKF_NET_LL as 142 | // bpf_skb_load_bytes_relative(BPF_HDR_START_MAC) and SKF_NET_OFF as 143 | // bpf_skb_load_bytes_relative(BPF_HDR_START_NET) 144 | 145 | uint32_t off; 146 | switch (f->code) { 147 | case BPF_MISC_TAX: 148 | r->X = r->A; 149 | break; 150 | case BPF_MISC_TXA: 151 | r->A = r->X; 152 | break; 153 | case BPF_ST: 154 | r->M[K & MEMWORDS_MASK] = r->A; 155 | break; 156 | case BPF_STX: 157 | r->M[K & MEMWORDS_MASK] = r->X; 158 | break; 159 | case BPF_LD_W | BPF_ABS: 160 | r->A = extract_u32(c->md, K); 161 | break; 162 | case BPF_LD_H | BPF_ABS: 163 | switch (K) { 164 | case SKF_AD_OFF + SKF_AD_PROTOCOL: 165 | // Did I get the endianness right? 166 | // md->eth_protocol is in network byte order, and we expect 167 | // this to be like 0x800, so host endianness. 
168 | r->A = bpf_ntohs(c->md->eth_protocol); 169 | break; 170 | default: 171 | r->A = extract_u16(c->md, K); 172 | } 173 | break; 174 | case BPF_LD_B | BPF_ABS: 175 | r->A = extract_u8(c->md, K); 176 | 177 | break; 178 | case BPF_LD_W | BPF_IND: 179 | off = r->X + K; 180 | r->A = extract_u32(c->md, off); 181 | break; 182 | case BPF_LD_H | BPF_IND: 183 | off = r->X + K; 184 | r->A = extract_u16(c->md, off); 185 | break; 186 | case BPF_LD_B | BPF_IND: 187 | off = r->X + K; 188 | r->A = extract_u8(c->md, off); 189 | break; 190 | case BPF_LDX_B | BPF_MSH: 191 | r->X = extract_u8(c->md, K); 192 | r->X = (r->X & 0xf) << 2; 193 | break; 194 | case BPF_LD_W | BPF_LEN: 195 | case BPF_LDX_W | BPF_LEN: 196 | /* Total packet length minus the UDP payload offset: 197 | * UDP payload length */ 198 | r->A = c->md->len - c->offset; 199 | break; 200 | case BPF_LD | BPF_IMM: 201 | /* This is also {0,0,0,K} code point. */ 202 | r->A = K; 203 | break; 204 | case BPF_LDX | BPF_IMM: 205 | r->X = K; 206 | break; 207 | case BPF_LD | BPF_MEM: 208 | r->A = r->M[K & MEMWORDS_MASK]; 209 | break; 210 | case BPF_LDX | BPF_MEM: 211 | r->X = r->M[K & MEMWORDS_MASK]; 212 | break; 213 | case BPF_JMP_JA: 214 | c->next_index = 1 + index + K; 215 | break; 216 | case BPF_JMP_JGT | BPF_X: 217 | c->next_index = 1 + index + (r->A > r->X ? f->jt : f->jf); 218 | break; 219 | case BPF_JMP_JGT | BPF_K: 220 | c->next_index = 1 + index + (r->A > K ? f->jt : f->jf); 221 | break; 222 | case BPF_JMP_JGE | BPF_X: 223 | c->next_index = 1 + index + (r->A >= r->X ? f->jt : f->jf); 224 | break; 225 | case BPF_JMP_JGE | BPF_K: 226 | c->next_index = 1 + index + (r->A >= K ? f->jt : f->jf); 227 | break; 228 | case BPF_JMP_JEQ | BPF_X: 229 | c->next_index = 1 + index + (r->A == r->X ? f->jt : f->jf); 230 | break; 231 | case BPF_JMP_JEQ | BPF_K: 232 | c->next_index = 1 + index + (r->A == K ? f->jt : f->jf); 233 | break; 234 | case BPF_JMP_JSET | BPF_X: 235 | c->next_index = 1 + index + (r->A & r->X ? 
f->jt : f->jf); 236 | break; 237 | case BPF_JMP_JSET | BPF_K: 238 | c->next_index = 1 + index + (r->A & K ? f->jt : f->jf); 239 | break; 240 | case BPF_ALU_NEG: 241 | r->A = -r->A; 242 | break; 243 | case BPF_ALU_LSH | BPF_X: 244 | r->A <<= r->X; 245 | break; 246 | case BPF_ALU_LSH | BPF_K: 247 | r->A <<= K; 248 | break; 249 | case BPF_ALU_RSH | BPF_X: 250 | r->A >>= r->X; 251 | break; 252 | case BPF_ALU_RSH | BPF_K: 253 | r->A >>= K; 254 | break; 255 | case BPF_ALU_ADD | BPF_X: 256 | r->A += r->X; 257 | break; 258 | case BPF_ALU_ADD | BPF_K: 259 | r->A += K; 260 | break; 261 | case BPF_ALU_SUB | BPF_X: 262 | r->A -= r->X; 263 | break; 264 | case BPF_ALU_SUB | BPF_K: 265 | r->A -= K; 266 | break; 267 | case BPF_ALU_MUL | BPF_X: 268 | r->A *= r->X; 269 | break; 270 | case BPF_ALU_MUL | BPF_K: 271 | r->A *= K; 272 | break; 273 | case BPF_ALU_DIV | BPF_X: 274 | case BPF_ALU_MOD | BPF_X: 275 | if (r->X == 0) { 276 | c->retcode = IERR_BADINSTR; 277 | return LOOP_BREAK; 278 | } 279 | goto do_div; 280 | case BPF_ALU_DIV | BPF_K: 281 | case BPF_ALU_MOD | BPF_K: 282 | if (K == 0) { 283 | c->retcode = IERR_BADINSTR; 284 | return LOOP_BREAK; 285 | } 286 | do_div: 287 | switch (f->code) { 288 | case BPF_ALU_DIV | BPF_X: 289 | r->A /= r->X; 290 | break; 291 | case BPF_ALU_DIV | BPF_K: 292 | r->A /= K; 293 | break; 294 | case BPF_ALU_MOD | BPF_X: 295 | r->A %= r->X; 296 | break; 297 | case BPF_ALU_MOD | BPF_K: 298 | r->A %= K; 299 | break; 300 | } 301 | break; 302 | case BPF_ALU_AND | BPF_X: 303 | r->A &= r->X; 304 | break; 305 | case BPF_ALU_AND | BPF_K: 306 | r->A &= K; 307 | break; 308 | case BPF_ALU_OR | BPF_X: 309 | r->A |= r->X; 310 | break; 311 | case BPF_ALU_OR | BPF_K: 312 | r->A |= K; 313 | break; 314 | case BPF_ALU_XOR | BPF_X: 315 | r->A ^= r->X; 316 | break; 317 | case BPF_ALU_XOR | BPF_K: 318 | r->A ^= K; 319 | break; 320 | 321 | case BPF_RET | BPF_K: 322 | c->retcode = IERR_OK; 323 | *c->retval = K; 324 | return LOOP_BREAK; 325 | case BPF_RET | BPF_X: 326 | 
c->retcode = IERR_OK; 327 | *c->retval = r->X; 328 | return LOOP_BREAK; 329 | case BPF_RET | BPF_A: 330 | c->retcode = IERR_OK; 331 | *c->retval = r->A; 332 | return LOOP_BREAK; 333 | default: 334 | c->retcode = IERR_BADINSTR; 335 | return LOOP_BREAK; 336 | } 337 | return LOOP_CONTINUE; 338 | } 339 | 340 | static int interpret_cbpf(struct sk_reuseport_md *md, struct reuseport_storage *state, 341 | int *retval) 342 | { 343 | struct interpret_ctx ctx = { 344 | .md = md, 345 | .state = state, 346 | .retval = retval, 347 | .retcode = IERR_INSTREXCEEDED, // exceeded instructions 348 | .offset = sizeof(struct udphdr), 349 | }; 350 | bpf_loop(state->dis.filter_len, _do_interpret_loop, &ctx, 0); 351 | return ctx.retcode; 352 | } 353 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. 
You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. 
(Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /tools/udpgrm_activate.py: -------------------------------------------------------------------------------- 1 | #!/bin/env -S python3 -u 2 | 3 | # Copyright (c) 2025 Cloudflare, Inc. 
# Licensed under the Apache 2.0 license found in the LICENSE file or at:
#     https://opensource.org/licenses/Apache-2.0

"""Create REUSEPORT UDP sockets and hand them to the systemd fd store.

The script binds ``--count`` unconnected UDP sockets to the given address,
optionally configures a udpgrm dissector and application number on them via
setsockopt(), pushes them into the systemd file descriptor store under the
given FDNAME and, unless ``--no-register`` is used, registers them as the
``working_gen + 1`` socket generation.  Anything after ``--`` on the command
line is executed with the sockets inherited.
"""

from systemd.daemon import notify
import argparse
import errno
import ipaddress
import re
import socket
import struct
import sys
import time

# udpgrm socket option names (used at the IPPROTO_UDP level).
UDP_GRM_WORKING_GEN = 200
UDP_GRM_SOCKET_GEN = 201
UDP_GRM_DISSECTOR = 202
UDP_GRM_FLOW_ASSURE = 203
UDP_GRM_SOCKET_APP = 204

# Dissector types and flags passed in the first word of UDP_GRM_DISSECTOR.
DISSECTOR_FLOW = 0
DISSECTOR_CBPF = 1
DISSECTOR_BESPOKE = 3
DISSECTOR_NOOP = 4
DISSECTOR_FLAG_VERBOSE = 0x8000
DISSECTOR_FLAG_READY = 0x10000
DISSECTOR_FLAG_FAILED = 0x20000

# Not exported by the socket module, hence spelled out here.
IP_FREEBIND = 15
IPV6_FREEBIND = 78

parser = argparse.ArgumentParser(
    prog='activate',
    description='Create unconnected UDP sockets and put them in systemd file descriptor store')

parser.add_argument('-4', '--also-ipv4', action='store_true',
                    help='Clear IPV6_V6ONLY flag. Allow IPv4 traffic on IPv6 socket.')
parser.add_argument('-c', '--count', default=1, type=int,
                    help='REUSEPORT group size - how many sockets to create')
parser.add_argument('--freebind', action='store_true',
                    help='set IP_FREEBIND / IPV6_FREEBIND')
parser.add_argument('--rcvbuf', default=16777216, type=int,
                    help='set SO_RCVBUF to use non-default receive buffer')
parser.add_argument('name', help='Systemd file descriptor name / FDNAME')
parser.add_argument('address',
                    help='Address and port to bind to (like: 127.0.0.1:443 or [::1]:443)')
parser.add_argument('-v', '--verbose', action='store_true',
                    help='Verbose output logging.')
parser.add_argument('--no-register', action='store_true',
                    help='Skip socket registration step. We register sockets by default as work_gen+1 generation.')
parser.add_argument('--advance-working-gen', action='store_true',
                    help='Advance working generation after registration is done. Not recomended. Should be done by the application.')

bpfparser = parser.add_argument_group('udpgrm cBPF related options')
bpfparser.add_argument('-b', '--bpf', dest='CBPFFILE',
                       help='Load cBPF from given file. Expecting format like from \'bpf_asm -c\'.')
bpfparser.add_argument('-a', '--app', dest='APPNO', type=int,
                       help='Application number')
bpfparser.add_argument('-m', '--apps-max', default=0, type=int,
                       help='Max application count')
bpfparser.add_argument('-t', '--tubular', default="", help='Tubular label')
bpfparser.add_argument('-f', '--flow-timeout', default=124,
                       type=int, help='For FLOW dissector, the flow timeout')
bpfparser.add_argument('-s', '--sni', dest='QUICHOSTNAME', action='append',
                       help='Parse QUIC hostname in format hostname:appnumber.')
bpfparser.add_argument('-d', '--digest', type=lambda x: int(x, 0), default=0,
                       help='select digest')
bpfparser.add_argument('-n', '--noop', action='store_true',
                       help='Use built-in NOOP dissector')

# Everything after a literal '--' is the command to exec once the sockets are
# set up; everything before it belongs to this script.
if '--' in sys.argv:
    idx = sys.argv.index('--')
    prog_args = sys.argv[1:idx]
    cmd = sys.argv[idx+1:]
else:
    prog_args = sys.argv[1:]
    cmd = []

args = parser.parse_args(prog_args)

if args.APPNO is not None and (not args.digest and args.CBPFFILE is None and args.QUICHOSTNAME is None):
    print("[!] You need to select --bpf, --digest or --sni before doing --app")
    sys.exit(1)

# List of (code, jt, jf, k) tuples parsed from the cBPF file.
cbpf = []
if args.CBPFFILE:
    r = re.compile(
        r'^\w*{\s+([x0-9a-f]+),\s+([\d]+),\s+([\d]+),\s+([x0-9a-f]+)\s+},\w*$')
    # Context manager so the file handle is closed even when we bail out.
    with open(args.CBPFFILE) as cbpf_file:
        for line in cbpf_file:
            # Only lines starting with '{' carry an instruction.
            if not line.startswith('{'):
                continue
            m = r.match(line.strip())
            if not m:
                print("[!] Bad line %r" % (line, ))
                sys.exit(1)
            cbpf.append(tuple(int(t, 0) for t in m.groups()))
    if args.verbose:
        print("[ ] Loaded %d cBPF instructions from %r" %
              (len(cbpf), args.CBPFFILE))

# List of [hostname, appnumber] pairs from repeated --sni options.
quic_hostnames = []
if args.QUICHOSTNAME:
    quic_hostnames = [h.split(":") for h in args.QUICHOSTNAME]


def addr_to_str(addr):
    """Format a socket address tuple as 'ip:port', bracketing IPv6 addresses."""
    ip_s = addr[0] if ':' not in addr[0] else "[%s]" % (addr[0],)
    return "%s:%d" % (ip_s, addr[1])


def pack_hostname(host):
    """Pack a [hostname, app] pair as app byte, length byte and UTF-8 hostname.

    The application number defaults to 0 when the pair has no second element.
    """
    hostname = host[0]
    app = int(host[1]) if len(host) > 1 else 0
    return struct.pack('BB', app, len(hostname)) + bytes(hostname, "utf-8")


def retrying_setsockopt(sd, level, optname, value, attempts=8, base_delay=0.01):
    """Call sd.setsockopt(), retrying with exponential backoff on EAGAIN.

    Args:
        sd: socket-like object exposing setsockopt().
        level, optname, value: passed through to setsockopt().
        attempts: maximum number of tries (at least one is always made).
        base_delay: sleep before the second try, doubled after each failure.

    Returns:
        Whatever sd.setsockopt() returns.

    Raises:
        OSError: immediately for any error other than EAGAIN, or the last
            EAGAIN once all attempts are used up, so that callers never
            mistake a failed setsockopt for a successful one.
    """
    attempts = max(attempts, 1)
    for i in range(attempts):
        try:
            return sd.setsockopt(level, optname, value)
        except OSError as e:
            if e.errno != errno.EAGAIN or i == attempts - 1:
                raise
            # exponential backoff
            time.sleep(base_delay * (2 ** i))


def _pack_dissector(args, flags):
    """Build the UDP_GRM_DISSECTOR option value for the selected dissector.

    Precedence is cBPF, then digest (bespoke), then NOOP, then tubular (flow).
    Returns None when none of these is selected, which happens when only
    --sni was given.
    """
    label = bytes(args.tubular, 'utf-8')
    if cbpf:
        program = b''.join(struct.pack('HBBI', *sf) for sf in cbpf)
        return struct.pack("IIII100sI512s", DISSECTOR_CBPF | flags,
                           0, args.apps_max, 0, label,
                           len(cbpf), program)
    if args.digest:
        hostnames = b''.join(
            struct.pack('BB62s', int(a), 0, bytes(h, "utf-8"))
            for h, a in quic_hostnames)
        return struct.pack("IIII100sI512s", DISSECTOR_BESPOKE | flags,
                           0, args.apps_max, args.digest, label,
                           len(quic_hostnames), hostnames)
    if args.noop:
        return struct.pack("IIII100s", DISSECTOR_NOOP | flags,
                           0, 0, 0, label)
    if args.tubular:
        return struct.pack("IIII100s", DISSECTOR_FLOW | flags,
                           args.flow_timeout, 0, 0, label)
    return None


def main(args):
    """Create, configure, store and (optionally) register the UDP sockets.

    Args:
        args: parsed command-line namespace from the module-level parser.

    Returns:
        The list of created socket objects.

    Exits the process with status 1 on any udpgrm setsockopt/getsockopt
    failure.
    """
    SOCKETS = []

    ip, separator, port = args.address.rpartition(':')
    port = int(port)
    ip = ipaddress.ip_address(ip.strip("[]"))
    family = socket.AF_INET if ip.version == 4 else socket.AF_INET6

    addr = (str(ip), port)
    for i in range(args.count):
        sd = socket.socket(family, socket.SOCK_DGRAM)
        if family == socket.AF_INET6:
            sd.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY,
                          0 if args.also_ipv4 else 1)

        if args.freebind:
            if family == socket.AF_INET:
                sd.setsockopt(socket.IPPROTO_IP, IP_FREEBIND, 1)
            else:
                sd.setsockopt(socket.IPPROTO_IPV6, IPV6_FREEBIND, 1)

        sd.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)

        if args.rcvbuf != 0:
            sd.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, args.rcvbuf)

        sd.bind(addr)
        if i == 0 and addr[1] == 0:
            # Port 0 was requested: reuse the port the first socket got so
            # the remaining sockets bind to the same address.
            addr = sd.getsockname()

        D_FLAGS = 0
        if args.verbose:
            D_FLAGS = DISSECTOR_FLAG_VERBOSE
        # The dissector is only configured through the first socket.
        if i == 0 and (cbpf or args.digest or args.tubular or quic_hostnames or args.noop):
            v = _pack_dissector(args, D_FLAGS)
            if v is None:
                # Only --sni was given; hostnames are consumed solely by the
                # digest (bespoke) dissector branch.
                print("[!] --sni requires --digest: hostnames are only used by the bespoke dissector.")
                sys.exit(1)
            try:
                retrying_setsockopt(sd, socket.IPPROTO_UDP,
                                    UDP_GRM_DISSECTOR, v)
            except OSError as e:
                if e.errno == errno.EPERM:
                    print("[!] setsockopt(UDP_GRM_DISSECTOR) failed. Dissector conflict? Try 'udpgrm delete %s'." % (
                        args.address,))
                else:
                    print(
                        "[!] setsockopt(UDP_GRM_DISSECTOR) failed. is udpgrm loaded? Try 'udpgrm --self --install'. (errno=%d)" % (e.errno,))
                sys.exit(1)

        if args.APPNO or args.apps_max:
            try:
                retrying_setsockopt(sd, socket.IPPROTO_UDP,
                                    UDP_GRM_SOCKET_APP, args.APPNO or 0)
            except OSError as e:
                if e.errno == errno.EOVERFLOW:
                    print("[!] setsockopt(UDP_GRM_SOCKET_APP) failed. Perhaps conflict with APPMAX? (errno=%d)" % (
                        e.errno,))
                else:
                    print("[!] setsockopt(UDP_GRM_SOCKET_APP) failed. is udpgrm loaded? (errno=%d)" % (
                        e.errno,))
                sys.exit(1)

        SOCKETS.append(sd)

    if args.verbose:
        print("[ ] FDNAME=%s deleting old entry from fd store" % (args.name,))
    notify("FDSTOREREMOVE=1\nFDNAME=%s" % (args.name,))

    if args.verbose:
        print("[ ] FDNAME=%s adding %d UDP %s sockets to fd store" %
              (args.name, args.count, addr_to_str(addr)))
    notify("FDSTORE=1\nFDNAME=%s" % (args.name, ),
           fds=[fd.fileno() for fd in SOCKETS])

    if not args.no_register:
        # Socket pre-registration, to avoid the service having to perform this dance itself.
        # Get the current working generation so we can set the correct number for the next generation
        try:
            working_gen = struct.unpack("i", SOCKETS[0].getsockopt(
                socket.IPPROTO_UDP, UDP_GRM_WORKING_GEN, 4))[0]
        except OSError:
            print(
                "[!] getsockopt(UDP_GRM_WORKING_GEN) failed. is udpgrm still running?")
            sys.exit(1)

        # Set the number of the new generation of sockets. This triggers a socket register
        # message in the udpgrm daemon.
        for fd in SOCKETS:
            try:
                retrying_setsockopt(fd, socket.IPPROTO_UDP,
                                    UDP_GRM_SOCKET_GEN, (working_gen + 1))
            except OSError:
                print(
                    "[!] setsockopt(UDP_GRM_SOCKET_GEN) failed. is udpgrm still running?")
                sys.exit(1)

        # Wait until socket registration is complete
        for fd in SOCKETS:
            for i in range(10):
                v = fd.getsockopt(socket.IPPROTO_UDP, UDP_GRM_SOCKET_GEN, 8)
                sk_gen, sk_idx = struct.unpack('II', v)
                if sk_idx != 0xffffffff:
                    # valid registration found
                    break
                if i >= 7:
                    print("[!] pre-registration failed. is udpgrm still running?")
                    sys.exit(1)
                # exponential backoff
                time.sleep(0.1 * (2 ** i))

        # Service can now call setsockopt(UDP_GRM_WORKING_GEN) to steer new connections towards these sockets
        if args.advance_working_gen:
            try:
                # Named explicitly instead of relying on a loop variable
                # that outlives its loop.
                retrying_setsockopt(SOCKETS[-1], socket.IPPROTO_UDP,
                                    UDP_GRM_WORKING_GEN, working_gen + 1)
            except OSError:
                print(
                    "[!] setsockopt(UDP_GRM_WORKING_GEN) failed. is udpgrm still running?")
                sys.exit(1)
    return SOCKETS


def clear_cloexec(fd):
    """Clear FD_CLOEXEC on *fd* (an object with fileno()) so it survives exec."""
    # Imported here so the function does not depend on an import performed
    # elsewhere in the module.
    import fcntl

    fd = fd.fileno()
    flags = fcntl.fcntl(fd, fcntl.F_GETFD)
    fcntl.fcntl(fd, fcntl.F_SETFD, flags & ~fcntl.FD_CLOEXEC)


if __name__ == '__main__':
    sockets = main(args)
    if cmd:
        # only lazy-import here, not on a hot path
        import os
        import shutil

        executable = shutil.which(cmd[0])
        if executable is None:
            # os.execve() would otherwise fail with an opaque TypeError.
            print("[!] Command %r not found in PATH" % (cmd[0],))
            sys.exit(1)

        for sd in sockets:
            clear_cloexec(sd)
        os.execve(executable, cmd, os.environ)