├── .gitignore
├── Makefile
├── deploy-in-ns.sh
├── test.sh
├── README.md
└── tcp_in_udp_tc.c


/.gitignore:
--------------------------------------------------------------------------------
1 | /compile_commands.json
2 | *.o
3 | *.pcap
4 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CFLAGS = -O2 -g -Wall -target bpf # -Werror
 2 | CC = clang
 3 | 
 4 | all: tcp_in_udp_tc.o
 5 | .PHONY: all clean
 6 | # Compile a C source into a BPF object; clang's -MJ also writes a compilation-database entry.
 7 | %.o: %.c
 8 | 	${CC} ${CFLAGS} -c $^ -o $@ -MJ compile_commands.json
 9 | # 'clean' is declared phony above so a stray file named "clean" cannot mask it.
10 | clean:
11 | 	rm -f *.o
12 | 


--------------------------------------------------------------------------------
/deploy-in-ns.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -e
 2 | # SPDX-License-Identifier: GPL-2.0
 3 | # Copyright (c) 2025, Matthieu Baerts.
 4 | 
 5 | export NS=tcp
 6 | 
 7 | cleanup()
 8 | {
 9 | 	return # NOTE(review): this early return makes the EXIT handler a no-op, the teardown below never runs -- confirm it is intended
10 | 
11 | 	local suffix
12 | 	ip netns pids "${NS}" | xargs -r kill
13 | 	ip netns del "${NS}" >/dev/null 2>&1
14 | }
15 | 
16 | trap cleanup EXIT
17 | 
18 | server()
19 | {
20 | 	ip netns exec "${NS}" iperf3 -s -D
21 | 	sleep 1 # making sure the daemon is launched
22 | }
23 | 
24 | tc_setup()
25 | {
26 | 	local ns="${NS}" iface="nshost"
27 | 	# Attach the TCP-in-UDP BPF program to ${iface} in ${ns}, then disable offloads.
28 | 	# ip netns will umount everything on exit
29 | 	ip netns exec "${ns}" sh -c "mount -t debugfs none /sys/kernel/debug && cat /sys/kernel/debug/tracing/trace_pipe" &
30 | 	# tcp_in_udp_tc.o only provides the "tc" (L2) and "tc_l3" ELF sections; egress needs "action csum udp".
31 | 	tc -n "${ns}" qdisc add dev "${iface}" clsact
32 | 	tc -n "${ns}" filter add dev "${iface}" egress  bpf object-file tcp_in_udp_tc.o section tc action csum udp
33 | 	tc -n "${ns}" filter add dev "${iface}" ingress bpf object-file tcp_in_udp_tc.o section tc direct-action
34 | 
35 | 	tc -n "${ns}" filter show dev "${iface}" egress
36 | 	tc -n "${ns}" filter show dev "${iface}" ingress
37 | 
38 | 	ip netns exec "${ns}" ethtool -K "${iface}" gro off gso off tso off lro off ufo off sg off
39 | 	ethtool -K "eth0" gro off gso off tso off lro off ufo off sg off
40 | }
41 | 
42 | capture()
43 | {
44 | 	ip netns exec "${NS}" tcpdump -i nshost -s 100 -w ns.pcap tcp or udp
45 | }
46 | 
47 | setup()
48 | {
49 | 	ip netns add "${NS}"
50 | 	ip -n "${NS}" link set lo up
51 | 
52 | 	ip link add hostns type veth peer name nshost
53 | 	ip link set nshost netns "${NS}"
54 | 
55 | 	ip link set hostns up
56 | 	ip -n "${NS}" link set nshost up
57 | 
58 | 	ip addr add 10.0.42.1/24 dev hostns
59 | 	ip -n "${NS}" addr add 10.0.42.2/24 dev nshost
60 | 
61 | 	ip -n "${NS}" route add default via 10.0.42.1 dev nshost
62 | 
63 | 	# TODO: forward port 5201 + masquerade
64 | }
65 | 
66 | setup
67 | server
68 | # capture
69 | 
70 | tc_setup
71 | # Drop into an interactive shell inside the namespace created by setup() (named "${NS}").
72 | case "${1}" in
73 | 	*)
74 | 		export -f capture
75 | 		ip netns exec "${NS}" sh -c "mount -t debugfs none /sys/kernel/debug && bash"
76 | 		;;
77 | esac
78 | 


--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash -e
  2 | # SPDX-License-Identifier: GPL-2.0
  3 | # Copyright (c) 2025, Matthieu Baerts.
  4 | 
  5 | export NS=tcp
  6 | export HOSTS=(cli cpe int net srv)
  7 | 
  8 | netns()
  9 | {
 10 | 	local suffix
 11 | 	local nss=()
 12 | 	for suffix in "${HOSTS[@]}"; do
 13 | 		nss+=("${NS}_${suffix}")
 14 | 	done
 15 | 	echo "${nss[@]}"
 16 | }
 17 | 
 18 | cleanup()
 19 | {
 20 | 	set +e # best-effort teardown: under "bash -e" a single failing command would abort this EXIT trap
 21 | 	tc -n "${NS}_cpe" -s action show action csum
 22 | 	tc -n "${NS}_net" -s action show action csum
 23 | 	local suffix # per host: dump counters, kill leftover processes, delete the netns
 24 | 	for suffix in "${HOSTS[@]}"; do
 25 | 		local ns="${NS}_${suffix}"
 26 | 		echo "== ${suffix} =="
 27 | 		ip netns exec "${ns}" nstat
 28 | 		ip netns pids "${ns}" | xargs -r kill
 29 | 		ip netns del "${ns}" >/dev/null 2>&1
 30 | 	done
 31 | }
 32 | 
 33 | trap cleanup EXIT
 34 | 
 35 | server()
 36 | {
 37 | 	ip netns exec "${NS}_srv" iperf3 -s -D
 38 | 	sleep .1 # making sure the daemon is launched
 39 | }
 40 | 
 41 | tc_client()
 42 | {
 43 | 	local ns="${NS}_cpe" iface="int" port="5201"
 44 | 
 45 | 	# ip netns will umount everything on exit
 46 | 	ip netns exec "${ns}" sh -c "mount -t debugfs none /sys/kernel/debug && cat /sys/kernel/debug/tracing/trace_pipe" &
 47 | 
 48 | 	tc -n "${ns}" qdisc add dev "${iface}" clsact
 49 | 	tc -n "${ns}" filter add dev "${iface}" egress  u32 match tcp dst "${port}" 0xffff action goto chain 1
 50 | 	tc -n "${ns}" filter add dev "${iface}" egress  chain 1 bpf object-file tcp_in_udp_tc.o section tc action csum udp
 51 | 	tc -n "${ns}" filter add dev "${iface}" ingress u32 match udp src "${port}" 0xffff action goto chain 1
 52 | 	tc -n "${ns}" filter add dev "${iface}" ingress chain 1 bpf object-file tcp_in_udp_tc.o section tc direct-action
 53 | 
 54 | 	tc -n "${ns}" filter show dev "${iface}" egress
 55 | 	tc -n "${ns}" filter show dev "${iface}" ingress
 56 | 
 57 | 	ip -n "${NS}_cli" link set "cpe" gso_max_segs 0
 58 | }
 59 | 
 60 | tc_server()
 61 | {
 62 | 	local ns="${NS}_net" iface="int" port="5201"
 63 | 
 64 | 	# ip netns will umount everything on exit
 65 | 	ip netns exec "${ns}" sh -c "mount -t debugfs none /sys/kernel/debug && cat /sys/kernel/debug/tracing/trace_pipe" &
 66 | 
 67 | 	tc -n "${ns}" qdisc add dev "${iface}" clsact
 68 | 	tc -n "${ns}" filter add dev "${iface}" egress  u32 match tcp src "${port}" 0xffff action goto chain 1
 69 | 	tc -n "${ns}" filter add dev "${iface}" egress  chain 1 bpf object-file tcp_in_udp_tc.o section tc action csum udp
 70 | 	tc -n "${ns}" filter add dev "${iface}" ingress u32 match udp dst "${port}" 0xffff action goto chain 1
 71 | 	tc -n "${ns}" filter add dev "${iface}" ingress chain 1 bpf object-file tcp_in_udp_tc.o section tc direct-action
 72 | 
 73 | 	tc -n "${ns}" filter show dev "${iface}" egress
 74 | 	tc -n "${ns}" filter show dev "${iface}" ingress
 75 | 
 76 | 	ip -n "${NS}_srv" link set "net" gso_max_segs 0
 77 | }
 78 | 
 79 | capture()
 80 | {
 81 | 	ip netns exec "${NS}_cli" tcpdump -i cpe -s 100 -w cli_cpe.pcap tcp or udp &
 82 | 	ip netns exec "${NS}_int" tcpdump -i cpe -s 100 -w int_cpe.pcap tcp or udp &
 83 | 	ip netns exec "${NS}_int" tcpdump -i net -s 100 -w int_net.pcap tcp or udp &
 84 | 	ip netns exec "${NS}_srv" tcpdump -i net -s 100 -w srv_net.pcap tcp or udp &
 85 | }
 86 | 
 87 | setup()
 88 | {
 89 | 	local suffix
 90 | 	for suffix in "${HOSTS[@]}"; do
 91 | 		local ns="${NS}_${suffix}"
 92 | 		ip netns add "${ns}"
 93 | 		ip -n "${ns}" link set lo up
 94 | 	done
 95 | 
 96 | 	#        .0.2  .0.1   .1.2  .1.1   .3.2  .3.1   .2.1  .2.2
 97 | 	#     cli -------- cpe -------- int -------- net -------- srv
 98 | 
 99 | 	ip link add "cli" netns "${NS}_cpe" type veth peer name "cpe" netns "${NS}_cli"
100 | 	ip link add "cpe" netns "${NS}_int" type veth peer name "int" netns "${NS}_cpe"
101 | 	ip link add "int" netns "${NS}_net" type veth peer name "net" netns "${NS}_int"
102 | 	ip link add "net" netns "${NS}_srv" type veth peer name "srv" netns "${NS}_net"
103 | 
104 | 	ip -n "${NS}_cli" link set "cpe" up
105 | 	ip -n "${NS}_cli" addr add dev "cpe" 10.0.0.2/24
106 | 	ip -n "${NS}_cli" route add default via 10.0.0.1 dev "cpe"
107 | 
108 | 	ip -n "${NS}_cpe" link set "cli" up
109 | 	ip -n "${NS}_cpe" addr add dev "cli" 10.0.0.1/24
110 | 	ip -n "${NS}_cpe" link set "int" up
111 | 	ip -n "${NS}_cpe" addr add dev "int" 10.0.1.2/24
112 | 	ip -n "${NS}_cpe" route add default via 10.0.1.1 dev "int"
113 | 
114 | 	ip -n "${NS}_int" link set "cpe" up
115 | 	ip -n "${NS}_int" addr add dev "cpe" 10.0.1.1/24
116 | 	#tc -n "${NS}_int" qdisc add dev "cpe" root netem rate 10mbit delay 5ms
117 | 	ip -n "${NS}_int" link set "net" up
118 | 	ip -n "${NS}_int" addr add dev "net" 10.0.3.2/24
119 | 	#tc -n "${NS}_int" qdisc add dev "net" root netem rate 10mbit delay 5ms
120 | 	ip -n "${NS}_int" route add 10.0.0.0/24 via 10.0.1.2 dev "cpe"
121 | 	ip -n "${NS}_int" route add 10.0.2.0/24 via 10.0.3.1 dev "net"
122 | 
123 | 	ip -n "${NS}_net" link set "int" up
124 | 	ip -n "${NS}_net" addr add dev "int" 10.0.3.1/24
125 | 	ip -n "${NS}_net" link set "srv" up
126 | 	ip -n "${NS}_net" addr add dev "srv" 10.0.2.1/24
127 | 	ip -n "${NS}_net" route add default via 10.0.3.2 dev "int"
128 | 
129 | 	ip -n "${NS}_srv" link set "net" up
130 | 	ip -n "${NS}_srv" addr add dev "net" 10.0.2.2/24
131 | 	ip -n "${NS}_srv" route add default via 10.0.2.1 dev "net"
132 | }
133 | 
134 | setup
135 | server
136 | # capture
137 | 
138 | tc_client
139 | tc_server
140 | 
141 | case "${1}" in
142 | 	*)
143 | 		export -f tc_client tc_server
144 | 		echo -e "\n\tNetns: $(netns)\n\tUse 'ip netns exec <NETNS> <CMD>' to execute a command in the netns.\n\tServer: iperf3 -c 10.0.2.2 -R\n"
145 | 		PS1="client# " ip netns exec ${NS}_cli sh -c "mount -t debugfs none /sys/kernel/debug && bash"
146 | 		;;
147 | esac
148 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # TCP in UDP
  2 | 
  3 | Middleboxes can mess with TCP flows, e.g. intercepting the connections and
  4 | dropping MPTCP options. Using a TCP-in-UDP tunnel will force such middleboxes
  5 | not to modify such TCP connections. The idea here is inspired by an old [IETF
  6 | draft](https://datatracker.ietf.org/doc/html/draft-cheshire-tcp-over-udp-00.html).
  7 | 
  8 | This "tunnel" is done in eBPF, from the TC hooks. For more details about why it
  9 | has been created, and its particularities, please check this
 10 | [blog post](https://blog.mptcp.dev/2025/07/14/TCP-in-UDP.html).
 11 | 
 12 | ## Headers
 13 | 
 14 | [UDP](https://www.ietf.org/rfc/rfc768.html):
 15 | 
 16 | ```
 17 |  0                   1                   2                   3
 18 |  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 19 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 20 | |          Source Port          |       Destination Port        |
 21 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 22 | |            Length             |           Checksum            |
 23 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 24 | ```
 25 | 
 26 | [TCP](https://www.ietf.org/rfc/rfc9293.html):
 27 | 
 28 | ```
 29 |  0                   1                   2                   3
 30 |  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 31 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 32 | |          Source Port          |       Destination Port        |
 33 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 34 | |                        Sequence Number                        |
 35 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 36 | |                    Acknowledgment Number                      |
 37 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 38 | |  Data |       |C|E|U|A|P|R|S|F|                               |
 39 | | Offset| Reser |R|C|R|C|S|S|Y|I|            Window             |
 40 | |       |       |W|E|G|K|H|T|N|N|                               |
 41 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 42 | |           Checksum            |         Urgent Pointer        |
 43 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 44 | |                      (Optional) Options                       |
 45 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 46 | ```
 47 | 
 48 | [TCP-in-UDP](https://datatracker.ietf.org/doc/html/draft-cheshire-tcp-over-udp-00.html):
 49 | 
 50 | ```
 51 |  0                   1                   2                   3
 52 |  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 53 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 54 | |          Source Port          |       Destination Port        |
 55 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 56 | |            Length             |           Checksum            |
 57 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 58 | |  Data |       |C|E| |A|P|R|S|F|                               |
 59 | | Offset| Reser |R|C|0|C|S|S|Y|I|            Window             |
 60 | |       |       |W|E| |K|H|T|N|N|                               |
 61 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 62 | |                        Sequence Number                        |
 63 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 64 | |                    Acknowledgment Number                      |
 65 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 66 | |                      (Optional) Options                       |
 67 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 68 | ```
 69 | 
 70 | Modifications:
 71 | - `URG` set to 0, `Urgent Pointer` is supposed to be zero (not used).
 72 | - Switch `Sequence Number` and `Acknowledgment Number` with `Urgent Pointer` and
 73 |   `Checksum`.
 74 | - Replace `Urgent Pointer` by the `Length`: Checksum needs to be recomputed.
 75 | 
 76 | Checksum:
 77 | - No need to recompute it from scratch, it can be derived from the previous
 78 |   values, by just changing the protocol.
 79 | 
 80 | - [UDP Checksum](https://www.rfc-editor.org/rfc/rfc768) computed from:
 81 |   - Source and destination address: from upper layer
 82 |   - Protocol (1B): UDP (17)
 83 |   - Length (2B): Data (variable) + UDP header (8 octets) lengths
 84 |   - TCP header
 85 |   - Data
 86 | 
 87 | - [TCP Checksum](https://www.ietf.org/rfc/rfc9293.html#section-3.1-6.18.1)
 88 |   computed from:
 89 |   - Source and destination address: from upper layer
 90 |   - Protocol (1B): TCP (6)
 91 |   - Length (2B): Data (variable) + TCP header (Between 20 and 56 octets) lengths
 92 |   - TCP header
 93 |   - Data
 94 | 
 95 | - Differences:
 96 |   - Source and destination address: not changed
 97 |   - Protocol: **changed**: UDP/TCP.
 98 |   - Data length: not changed
 99 |   - L4 header: **changed**: `UDP Length` vs `TCP Urgent Pointer`
100 |   - Data: not changed
101 | 
102 | 
103 | ## Build
104 | 
105 | Build the binary using `make`. Clang, `libelf`, `libc6`, and `libbpf` are
106 | required:
107 | 
108 | ```
109 | sudo apt install make clang libelf-dev libc6-dev-i386 libbpf-dev
110 | ```
111 | 
112 | 
113 | ## Setup
114 | 
115 | Load it with `tc` commands:
116 | 
117 | - Client:
118 |   ```
119 |   tc qdisc add dev "${IFACE}" clsact
120 |   tc filter add dev "${IFACE}" egress  u32 match ip dport "${PORT}" 0xffff action goto chain 1
121 |   tc filter add dev "${IFACE}" egress  chain 1 bpf object-file tcp_in_udp_tc.o section tc action csum udp
122 |   tc filter add dev "${IFACE}" ingress u32 match ip sport "${PORT}" 0xffff action goto chain 1
123 |   tc filter add dev "${IFACE}" ingress chain 1 bpf object-file tcp_in_udp_tc.o section tc direct-action
124 |   ```
125 | - Server:
126 |   ```
127 |   tc qdisc add dev "${IFACE}" clsact
128 |   tc filter add dev "${IFACE}" egress  u32 match ip sport "${PORT}" 0xffff action goto chain 1
129 |   tc filter add dev "${IFACE}" egress  chain 1 bpf object-file tcp_in_udp_tc.o section tc action csum udp
130 |   tc filter add dev "${IFACE}" ingress u32 match ip dport "${PORT}" 0xffff action goto chain 1
131 |   tc filter add dev "${IFACE}" ingress chain 1 bpf object-file tcp_in_udp_tc.o section tc direct-action
132 |   ```
133 | 
134 | On layer 3 interfaces, use the ELF section called `tc_l3`.
135 | 
136 | Multiple u32 filters can be used to send the traffic of more than one port to
137 | the BPF program.
138 | 
139 | If the TCP program supports setting marks (`SO_MARK`), use it for egress to
140 | prevent processing traffic that is not from the TCP program. For client, this
141 | allows traffic to a different IP address with the same TCP port. For server,
142 | this prevents sending packet to BPF program if the interface has multiple IP
143 | addresses assigned and if the TCP program doesn't bind to all of them.
144 | 
145 | - Client & Server:
146 |   ```
147 |   tc filter add dev "${IFACE}" egress  handle 2 fw action goto chain 1
148 |   ```
149 | 
150 | Be warned that `SO_MARK` can't be used for ingress as the system doesn't expect
151 | incoming UDP packets. Therefore, all incoming packets from the interface with
152 | matching port will be sent to the BPF program. To decrease the chance of this
153 | happening, it is recommended to use ports that are outside the ephemeral port
154 | range set on `net.ipv4.ip_local_port_range` (default: 32768-60999). This option
155 | applies to IPv6 too.
156 | 
157 | Generic Segmentation Offload (GSO) and Generic Receive Offload (GRO) cannot be
158 | used for this traffic, because each UDP packet will carry a part of the TCP
159 | headers as part of the data. This part of the data is specific to one packet,
160 | therefore, it cannot be merged with the next data. UDP GRO is only done on
161 | demand, e.g. when the userspace asks it (`setsockopt(IPPROTO_UDP, UDP_GRO)`) or
162 | for some in-kernel tunnels, so GRO doesn't need to be disabled. To disable GSO:
163 | 
164 | ```
165 | ip link set ${IFACE} gso_max_segs 0
166 | ```
167 | 
168 | Note: to get some stats, in egress, it is possible to use:
169 | 
170 | ```
171 | tc -s action show action csum
172 | tc -s -j action show action csum | jq
173 | ```
174 | 
175 | It might be interesting to monitor the tracing ring buffer for warnings and
176 | other messages generated by the eBPF program:
177 | 
178 | ```
179 | cat /sys/kernel/debug/tracing/trace_pipe
180 | ```
181 | 
182 | To stop the eBPF program:
183 | 
184 | ```
185 | tc filter del dev "${IFACE}" egress
186 | tc filter del dev "${IFACE}" ingress
187 | ```
188 | 
189 | ## MSS
190 | 
191 | Because the packets will be in UDP and not TCP, any MSS clamping will have no
192 | effect here. It is important to avoid IP fragmentation. In other words, it
193 | might be required to adapt the MTU (or the MSS).
194 | 


--------------------------------------------------------------------------------
/tcp_in_udp_tc.c:
--------------------------------------------------------------------------------
  1 | /* SPDX-License-Identifier: GPL-2.0 */
  2 | #include <linux/bpf.h>
  3 | #include <linux/if_ether.h>
  4 | #include <linux/in.h>
  5 | #include <linux/ip.h>
  6 | #include <linux/ipv6.h>
  7 | #include <linux/pkt_cls.h>
  8 | #include <linux/tcp.h>
  9 | #include <linux/udp.h>
 10 | #include <bpf/bpf_endian.h>
 11 | #include <bpf/bpf_helpers.h>
 12 | 
 13 | struct tcp_in_udp_hdr {
 14 | 	struct udphdr udphdr;
 15 | 	__be32	doff_flags_window;
 16 | 	__be32	seq;
 17 | 	__be32	ack_seq;
 18 | };
 19 | 
 20 | /* Header cursor to keep track of current parsing position */
 21 | struct hdr_cursor {
 22 | 	void *pos;
 23 | };
 24 | 
 25 | /*******************************************
 26 |  ** parse_*hdr helpers from XDP tutorials **
 27 |  *******************************************/
 28 | 
 29 | /* @return: next header */
 30 | static __always_inline int parse_ethhdr(struct hdr_cursor *nh,
 31 | 					void *data_end,
 32 | 					struct ethhdr **ethhdr)
 33 | {
 34 | 	struct ethhdr *eth = nh->pos;
 35 | 	int hdrsize = sizeof(*eth);
 36 | 
 37 | 	/* Byte-count bounds check; check if current pointer + size of header
 38 | 	 * is after data_end.
 39 | 	 */
 40 | 	if ((void *)eth + hdrsize > data_end)
 41 | 		return -1;
 42 | 
 43 | 	nh->pos += hdrsize;
 44 | 	*ethhdr = eth;
 45 | 
 46 | 	return eth->h_proto; /* network-byte-order */
 47 | }
 48 | 
 49 | /* @return: next header */
 50 | static __always_inline int parse_ip6hdr(struct hdr_cursor *nh,
 51 | 					void *data_end,
 52 | 					struct ipv6hdr **ip6hdr)
 53 | {
 54 | 	struct ipv6hdr *ip6h = nh->pos;
 55 | 	int hdrsize = sizeof(*ip6h);
 56 | 
 57 | 	if ((void *)ip6h + hdrsize > data_end)
 58 | 		return -1;
 59 | 
 60 | 	nh->pos += hdrsize;
 61 | 	*ip6hdr = ip6h;
 62 | 
 63 | 	/* TODO: support extensions */
 64 | 	return ip6h->nexthdr;
 65 | }
 66 | 
 67 | /* @return: next header */
 68 | static __always_inline int parse_iphdr(struct hdr_cursor *nh,
 69 | 				       void *data_end,
 70 | 				       struct iphdr **iphdr)
 71 | {
 72 | 	struct iphdr *iph = nh->pos;
 73 | 	int hdrsize = sizeof(*iph);
 74 | 
 75 | 	if ((void *)iph + hdrsize > data_end)
 76 | 		return -1;
 77 | 
 78 | 	hdrsize = iph->ihl << 2;
 79 | 	/* Sanity check packet field is valid */
 80 | 	if(hdrsize < sizeof(*iph))
 81 | 		return -1;
 82 | 
 83 | 	/* Variable-length IPv4 header, need to use byte-based arithmetic */
 84 | 	if (nh->pos + hdrsize > data_end)
 85 | 		return -1;
 86 | 
 87 | 	nh->pos += hdrsize;
 88 | 	*iphdr = iph;
 89 | 
 90 | 	return iph->protocol;
 91 | }
 92 | 
 93 | /* @return: header len */
 94 | static __always_inline int parse_tcphdr(struct hdr_cursor *nh,
 95 | 					void *data_end,
 96 | 					struct tcphdr **tcphdr)
 97 | {
 98 | 	struct tcphdr *tcph = nh->pos;
 99 | 	int hdrsize = sizeof(*tcph);
100 | 
101 | 	if ((void *)tcph + hdrsize > data_end)
102 | 		return -1;
103 | 
104 | 	hdrsize = tcph->doff << 2;
105 | 	/* Sanity check packet field is valid */
106 | 	if(hdrsize < sizeof(*tcph))
107 | 		return -1;
108 | 
109 | 	/* Variable-length TCP header, need to use byte-based arithmetic */
110 | 	if (nh->pos + hdrsize > data_end)
111 | 		return -1;
112 | 
113 | 	nh->pos += hdrsize;
114 | 	*tcphdr = tcph;
115 | 
116 | 	return hdrsize;
117 | }
118 | 
119 | /* @return: payload len */
120 | static __always_inline int parse_udphdr(struct hdr_cursor *nh,
121 | 					void *data_end,
122 | 					struct udphdr **udphdr)
123 | {
124 | 	struct udphdr *udph = nh->pos;
125 | 	int hdrsize = sizeof(*udph);
126 | 	int len;
127 | 
128 | 	if ((void *)udph + hdrsize > data_end)
129 | 		return -1;
130 | 
131 | 	nh->pos += hdrsize;
132 | 	*udphdr = udph;
133 | 
134 | 	len = bpf_ntohs(udph->len) - hdrsize;
135 | 	if (len < 0)
136 | 		return -1;
137 | 
138 | 	return len;
139 | }
140 | 
141 | 
142 | /*************
143 |  ** Ingress **
144 |  *************/
145 | 
146 | static __always_inline void
147 | udp_to_tcp(struct __sk_buff *skb, struct hdr_cursor *nh,
148 | 	   struct iphdr *iphdr, struct ipv6hdr *ipv6hdr)
149 | {
150 | 	void *data_end = (void *)(long)skb->data_end;
151 | 	void *data = (void *)(long)skb->data;
152 | 	struct tcp_in_udp_hdr *tuhdr, tuhdr_cpy;
153 | 	struct tcphdr tcphdr;
154 | 	int nh_off = nh->pos - data;
155 | 	__u8 proto = IPPROTO_TCP;
156 | 	__be16 zero = 0;
157 | 
158 | 	if (parse_udphdr(nh, data_end, (struct udphdr**)&tuhdr) < 0)
159 | 		goto out;
160 | 
161 | 	if (skb->gso_segs > 1) {
162 | 		bpf_printk("udp-tcp: WARNING, GRO/LRO should be disabled: length:%u, segs:%u, size:%u\n",
163 | 			   skb->len, skb->gso_segs, skb->gso_size);
164 | 		goto out;
165 | 	}
166 | 
167 | 	/* Load bytes, because we might only get the UDP header size in case the
168 | 	 * skb is non-linear. We could also pull the data, and get nh->pos again
169 | 	 */
170 | 	if (bpf_skb_load_bytes(skb, nh_off, &tuhdr_cpy, sizeof(struct tcphdr))) {
171 | 		bpf_printk("udp-tcp: WARNING: data_end too small: ulen:%u dlen:%u\n",
172 | 			   bpf_ntohs(tuhdr->udphdr.len), data_end - (void *)tuhdr);
173 | 		goto out;
174 | 	}
175 | 
176 | 	tcphdr.source = tuhdr_cpy.udphdr.source;
177 | 	tcphdr.dest = tuhdr_cpy.udphdr.dest;
178 | 	tcphdr.seq = tuhdr_cpy.seq;
179 | 	tcphdr.ack_seq = tuhdr_cpy.ack_seq;
180 | 	__builtin_memcpy((void *)&tcphdr + sizeof(__be32) * 3,
181 | 			 &tuhdr_cpy.doff_flags_window, sizeof(__be32));
182 | 	tcphdr.check = tuhdr_cpy.udphdr.check;
183 | 	bpf_skb_store_bytes(skb, nh_off, &tcphdr, sizeof(tcphdr), 0);
184 | 
185 | 	/* tcphdr->urg_ptr = 0; */
186 | 	bpf_skb_store_bytes(skb, nh_off + offsetof(struct tcphdr, urg_ptr),
187 | 			    &zero, sizeof(__be16), BPF_F_RECOMPUTE_CSUM);
188 | 
189 | 	/* Change protocol: UDP -> TCP */
190 | 	if (iphdr) {
191 | 		__be16 proto_old = bpf_htons(IPPROTO_UDP);
192 | 		__be16 proto_new = bpf_htons(IPPROTO_TCP);
193 | 		int ip_off = (void*)iphdr - data;
194 | 
195 | 		/* iphdr->protocol = IPPROTO_TCP; */
196 | 		bpf_skb_store_bytes(skb, ip_off + offsetof(struct iphdr, protocol),
197 | 				    &proto, sizeof(proto), BPF_F_RECOMPUTE_CSUM);
198 | 
199 | 		bpf_l3_csum_replace(skb, ((void*)iphdr - data) +
200 | 					  offsetof(struct iphdr, check),
201 | 				    proto_old, proto_new, sizeof(__be16));
202 | 		bpf_l4_csum_replace(skb, nh_off + offsetof(struct tcphdr, check),
203 | 				    proto_old, proto_new,
204 | 				    BPF_F_PSEUDO_HDR | sizeof(__be16));
205 | 	} else if (ipv6hdr) {
206 | 		__be32 proto_old = bpf_htonl(IPPROTO_UDP);
207 | 		__be32 proto_new = bpf_htonl(IPPROTO_TCP);
208 | 		int ipv6_off = (void*)ipv6hdr - data;
209 | 
210 | 		/* ipv6hdr->nexthdr = IPPROTO_TCP; */
211 | 		bpf_skb_store_bytes(skb, ipv6_off + offsetof(struct ipv6hdr, nexthdr),
212 | 				    &proto, sizeof(proto), BPF_F_RECOMPUTE_CSUM);
213 | 
214 | 		bpf_l4_csum_replace(skb, nh_off + offsetof(struct tcphdr, check),
215 | 				    proto_old, proto_new,
216 | 				    BPF_F_PSEUDO_HDR | sizeof(__be32));
217 | 	}
218 | 
219 | 	/* UDP Length vs Urgent Pointer */
220 | 	bpf_l4_csum_replace(skb, nh_off + offsetof(struct tcphdr, check),
221 | 			    tuhdr_cpy.udphdr.len, zero,
222 | 			    sizeof(__be16));
223 | 
224 | 	/* after mangling on headers through direct packet access */
225 | 	bpf_set_hash_invalid(skb);
226 | out:
227 | 	return;
228 | }
229 | 
230 | 
231 | /************
232 |  ** Egress **
233 |  ************/
234 | /* Rewrite the TCP header at nh->pos as TCP-in-UDP; @return: TC_ACT_PIPE if converted, TC_ACT_OK if skipped */
235 | static __always_inline int
236 | tcp_to_udp(struct __sk_buff *skb, struct hdr_cursor *nh,
237 | 	   struct iphdr *iphdr, struct ipv6hdr *ipv6hdr)
238 | {
239 | 	void *data_end = (void *)(long)skb->data_end;
240 | 	void *data = (void *)(long)skb->data;
241 | 	struct tcp_in_udp_hdr *tuhdr = nh->pos;
242 | 	struct tcphdr *tcphdr, tcphdr_cpy;
243 | 	int nh_off = nh->pos - data;
244 | 	__be16 udp_len, zero = 0;
245 | 	__be16 proto_old = bpf_htons(IPPROTO_TCP);
246 | 	__be16 proto_new = bpf_htons(IPPROTO_UDP);
247 | 
248 | 	if (parse_tcphdr(nh, data_end, &tcphdr) < 0)
249 | 		goto out;
250 | 
251 | 	if (tcphdr->urg) {
252 | 		if (iphdr) /* %pI4 takes a pointer to the address, like %pI6c below */
253 | 			bpf_printk("tcp-udp: Skip: %pI4:%u -> %pI4:%u: urgent\n",
254 | 				   &iphdr->saddr,
255 | 				   bpf_ntohs(tcphdr->source),
256 | 				   &iphdr->daddr,
257 | 				   bpf_ntohs(tcphdr->dest));
258 | 		else if (ipv6hdr)
259 | 			bpf_printk("tcp-udp: Skip: %pI6c:%u -> %pI6c:%u: urgent\n",
260 | 				   &ipv6hdr->saddr,
261 | 				   bpf_ntohs(tcphdr->source),
262 | 				   &ipv6hdr->daddr,
263 | 				   bpf_ntohs(tcphdr->dest));
264 | 		goto out; /* TODO: or set to 0 and adapt checksum? */
265 | 	}
266 | 
267 | 	if (skb->gso_segs > 1) {
268 | 		bpf_printk("tcp-udp: WARNING, GSO/TSO should be disabled: length:%u, segs:%u, size:%u\n",
269 | 			   skb->len, skb->gso_segs, skb->gso_size);
270 | 		goto out;
271 | 	}
272 | 
273 | 	if (iphdr) {
274 | 		udp_len = bpf_htons(bpf_ntohs(iphdr->tot_len) -
275 | 				    ((void*)tcphdr - (void*)iphdr));
276 | 	} else if (ipv6hdr) {
277 | 		udp_len = ipv6hdr->payload_len;
278 | 	} else {
279 | 		goto out;
280 | 	}
281 | 
282 | 	/* Do the modification before calling bpf_...(skb) helpers which can
283 | 	 * modify the SKB and cause "invalid mem access 'scalar'" errors.
284 | 	 */
285 | 	__builtin_memcpy(&tcphdr_cpy, tcphdr, sizeof(struct tcphdr));
286 | 	tuhdr->udphdr.check = tcphdr_cpy.check;
287 | 	__builtin_memcpy(&tuhdr->doff_flags_window,
288 | 			 (void *)&tcphdr_cpy + sizeof(__be32) * 3, sizeof(__be32));
289 | 	tuhdr->seq = tcphdr_cpy.seq;
290 | 	tuhdr->ack_seq = tcphdr_cpy.ack_seq;
291 | 
292 | 	tuhdr->udphdr.len = udp_len;
293 | 
294 | 	/* Change protocol: TCP -> UDP */
295 | 	if (iphdr) {
296 | 		int ip_off = (void*)iphdr - data;
297 | 
298 | 		iphdr->protocol = IPPROTO_UDP;
299 | 
300 | 		bpf_l3_csum_replace(skb, ip_off + offsetof(struct iphdr, check),
301 | 				    proto_old, proto_new, sizeof(__be16));
302 | 	} else if (ipv6hdr) {
303 | 		ipv6hdr->nexthdr = IPPROTO_UDP;
304 | 	}
305 | 	bpf_l4_csum_replace(skb, nh_off + offsetof(struct udphdr, check),
306 | 			    proto_old, proto_new, sizeof(__be16) | BPF_F_PSEUDO_HDR);
307 | 
308 | 	/* UDP Length vs Urgent Pointer */
309 | 	bpf_l4_csum_replace(skb, nh_off + offsetof(struct udphdr, check),
310 | 			    zero, udp_len, sizeof(__be16));
311 | 
312 | 	return TC_ACT_PIPE;
313 | out:
314 | 	return TC_ACT_OK;
315 | }
316 | 
317 | SEC("tc")
318 | int tc_tcp_in_udp_l2(struct __sk_buff *skb)
319 | {
320 | 	void *data_end = (void *)(long)skb->data_end;
321 | 	void *data = (void *)(long)skb->data;
322 | 	struct hdr_cursor nh = { .pos = data };
323 | 	int eth_type, ip_type, ret = TC_ACT_OK;
324 | 	struct ipv6hdr *ipv6hdr = NULL;
325 | 	struct iphdr *iphdr = NULL;
326 | 	struct ethhdr *eth;
327 | 
328 | 	eth_type = parse_ethhdr(&nh, data_end, &eth);
329 | 	if (eth_type == bpf_htons(ETH_P_IP))
330 | 		ip_type = parse_iphdr(&nh, data_end, &iphdr);
331 | 	else if (eth_type == bpf_htons(ETH_P_IPV6))
332 | 		ip_type = parse_ip6hdr(&nh, data_end, &ipv6hdr);
333 | 	else
334 | 		goto out;
335 | 
336 | 	if (ip_type == IPPROTO_TCP)
337 | 		return tcp_to_udp(skb, &nh, iphdr, ipv6hdr);
338 | 	if (ip_type == IPPROTO_UDP)
339 | 		udp_to_tcp(skb, &nh, iphdr, ipv6hdr);
340 | 
341 | out:
342 | 	return ret;
343 | }
344 | 
345 | SEC("tc_l3")
346 | int tc_tcp_in_udp_l3(struct __sk_buff *skb)
347 | {
348 | 	void *data_end = (void *)(long)skb->data_end;
349 | 	void *data = (void *)(long)skb->data;
350 | 	struct hdr_cursor nh = { .pos = data };
351 | 	int ip_type, ret = TC_ACT_OK;
352 | 	struct ipv6hdr *ipv6hdr = NULL;
353 | 	struct iphdr *iphdr = NULL;
354 | 
355 | 	if (skb->protocol == bpf_htons(ETH_P_IP))
356 | 		ip_type = parse_iphdr(&nh, data_end, &iphdr);
357 | 	else if (skb->protocol == bpf_htons(ETH_P_IPV6))
358 | 		ip_type = parse_ip6hdr(&nh, data_end, &ipv6hdr);
359 | 	else
360 | 		goto out;
361 | 
362 | 	if (ip_type == IPPROTO_TCP)
363 | 		return tcp_to_udp(skb, &nh, iphdr, ipv6hdr);
364 | 	if (ip_type == IPPROTO_UDP)
365 | 		udp_to_tcp(skb, &nh, iphdr, ipv6hdr);
366 | 
367 | out:
368 | 	return ret;
369 | }
370 | 
371 | char _license[] SEC("license") = "GPL";
372 | 


--------------------------------------------------------------------------------