├── .gitignore
├── Makefile
├── deploy-in-ns.sh
├── test.sh
├── README.md
└── tcp_in_udp_tc.c


/.gitignore:
--------------------------------------------------------------------------------
/compile_commands.json
*.o
*.pcap

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
CFLAGS = -O2 -g -Wall -target bpf # -Werror
CC = clang

all: tcp_in_udp_tc.o
.PHONY: all

%.o: %.c
	${CC} ${CFLAGS} -c $^ -o $@ -MJ compile_commands.json

clean:
	rm -f *.o

--------------------------------------------------------------------------------
/deploy-in-ns.sh:
--------------------------------------------------------------------------------
#!/bin/bash -e
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2025, Matthieu Baerts.
#
# Create one network namespace linked to the host through a veth pair, start an
# iperf3 server in it, and attach the TCP-in-UDP eBPF program to the interface
# of the namespace.

export NS=tcp

# EXIT trap handler.
# NOTE(review): the early return disables the teardown below, so the netns and
# the processes running in it outlive this script. Presumably intentional for a
# "deploy" helper -- confirm before removing it.
cleanup()
{
	return

	local suffix
	ip netns pids "${NS}" | xargs -r kill
	ip netns del "${NS}" >/dev/null 2>&1
}

trap cleanup EXIT

# Start the iperf3 server as a daemon in the netns.
server()
{
	ip netns exec "${NS}" iperf3 -s -D
	sleep 1 # making sure the daemon is launched
}

# Attach the eBPF program on both directions of the netns interface, and
# disable the offloads.
tc_setup()
{
	local ns="${NS}" iface="nshost"

	# ip netns will umount everything on exit
	ip netns exec "${ns}" sh -c "mount -t debugfs none /sys/kernel/debug && cat /sys/kernel/debug/tracing/trace_pipe" &

	tc -n "${ns}" qdisc add dev "${iface}" clsact
	# tcp_in_udp_tc.o only provides the "tc" (L2) and "tc_l3" (L3) sections:
	# the same program handles both directions. Load it the same way as in
	# test.sh: on egress, it is followed by the csum action.
	tc -n "${ns}" filter add dev "${iface}" egress  bpf object-file tcp_in_udp_tc.o section tc action csum udp
	tc -n "${ns}" filter add dev "${iface}" ingress bpf object-file tcp_in_udp_tc.o section tc direct-action

	tc -n "${ns}" filter show dev "${iface}" egress
	tc -n "${ns}" filter show dev "${iface}" ingress

	ip netns exec "${ns}" ethtool -K "${iface}" gro off gso off tso off lro off ufo off sg off
	ethtool -K "eth0" gro off gso off tso off lro off ufo off sg off
}

# Capture the TCP and UDP traffic seen on the netns interface (not started by
# default, see below).
capture()
{
	ip netns exec "${NS}" tcpdump -i nshost -s 100 -w ns.pcap tcp or udp
}

# Create the netns and the veth pair: hostns (10.0.42.1) <-> nshost (10.0.42.2)
setup()
{
	ip netns add "${NS}"
	ip -n "${NS}" link set lo up

	ip link add hostns type veth peer name nshost
	ip link set nshost netns "${NS}"

	ip link set hostns up
	ip -n "${NS}" link set nshost up

	ip addr add 10.0.42.1/24 dev hostns
	ip -n "${NS}" addr add 10.0.42.2/24 dev nshost

	ip -n "${NS}" route add default via 10.0.42.1 dev nshost

	# TODO: forward port 5201 + masquerade
}

setup
server
# capture

tc_setup

case "${1}" in
	*)
		export -f capture
		# Only the "${NS}" netns is created by this script.
		ip netns exec "${NS}" sh -c "mount -t debugfs none /sys/kernel/debug && bash"
		;;
esac

--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash -e
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2025, Matthieu Baerts.
4 | 5 | export NS=tcp 6 | export HOSTS=(cli cpe int net srv) 7 | 8 | netns() 9 | { 10 | local suffix 11 | local nss=() 12 | for suffix in "${HOSTS[@]}"; do 13 | nss+=("${NS}_${suffix}") 14 | done 15 | echo "${nss[@]}" 16 | } 17 | 18 | cleanup() 19 | { 20 | tc -n "${NS}_cpe" -s action show action csum 21 | tc -n "${NS}_net" -s action show action csum 22 | 23 | local suffix 24 | for suffix in "${HOSTS[@]}"; do 25 | local ns="${NS}_${suffix}" 26 | echo "== ${suffix} ==" 27 | ip netns exec "${ns}" nstat 28 | ip netns pids "${ns}" | xargs -r kill 29 | ip netns del "${ns}" >/dev/null 2>&1 30 | done 31 | } 32 | 33 | trap cleanup EXIT 34 | 35 | server() 36 | { 37 | ip netns exec "${NS}_srv" iperf3 -s -D 38 | sleep .1 # making sure the daemon is launched 39 | } 40 | 41 | tc_client() 42 | { 43 | local ns="${NS}_cpe" iface="int" port="5201" 44 | 45 | # ip netns will umount everything on exit 46 | ip netns exec "${ns}" sh -c "mount -t debugfs none /sys/kernel/debug && cat /sys/kernel/debug/tracing/trace_pipe" & 47 | 48 | tc -n "${ns}" qdisc add dev "${iface}" clsact 49 | tc -n "${ns}" filter add dev "${iface}" egress u32 match tcp dst "${port}" 0xffff action goto chain 1 50 | tc -n "${ns}" filter add dev "${iface}" egress chain 1 bpf object-file tcp_in_udp_tc.o section tc action csum udp 51 | tc -n "${ns}" filter add dev "${iface}" ingress u32 match udp src "${port}" 0xffff action goto chain 1 52 | tc -n "${ns}" filter add dev "${iface}" ingress chain 1 bpf object-file tcp_in_udp_tc.o section tc direct-action 53 | 54 | tc -n "${ns}" filter show dev "${iface}" egress 55 | tc -n "${ns}" filter show dev "${iface}" ingress 56 | 57 | ip -n "${NS}_cli" link set "cpe" gso_max_segs 0 58 | } 59 | 60 | tc_server() 61 | { 62 | local ns="${NS}_net" iface="int" port="5201" 63 | 64 | # ip netns will umount everything on exit 65 | ip netns exec "${ns}" sh -c "mount -t debugfs none /sys/kernel/debug && cat /sys/kernel/debug/tracing/trace_pipe" & 66 | 67 | tc -n "${ns}" qdisc add dev 
"${iface}" clsact 68 | tc -n "${ns}" filter add dev "${iface}" egress u32 match tcp src "${port}" 0xffff action goto chain 1 69 | tc -n "${ns}" filter add dev "${iface}" egress chain 1 bpf object-file tcp_in_udp_tc.o section tc action csum udp 70 | tc -n "${ns}" filter add dev "${iface}" ingress u32 match udp dst "${port}" 0xffff action goto chain 1 71 | tc -n "${ns}" filter add dev "${iface}" ingress chain 1 bpf object-file tcp_in_udp_tc.o section tc direct-action 72 | 73 | tc -n "${ns}" filter show dev "${iface}" egress 74 | tc -n "${ns}" filter show dev "${iface}" ingress 75 | 76 | ip -n "${NS}_srv" link set "net" gso_max_segs 0 77 | } 78 | 79 | capture() 80 | { 81 | ip netns exec "${NS}_cli" tcpdump -i cpe -s 100 -w cli_cpe.pcap tcp or udp & 82 | ip netns exec "${NS}_int" tcpdump -i cpe -s 100 -w int_cpe.pcap tcp or udp & 83 | ip netns exec "${NS}_int" tcpdump -i net -s 100 -w int_net.pcap tcp or udp & 84 | ip netns exec "${NS}_srv" tcpdump -i net -s 100 -w srv_net.pcap tcp or udp & 85 | } 86 | 87 | setup() 88 | { 89 | local suffix 90 | for suffix in "${HOSTS[@]}"; do 91 | local ns="${NS}_${suffix}" 92 | ip netns add "${ns}" 93 | ip -n "${ns}" link set lo up 94 | done 95 | 96 | # .0.2 .0.1 .1.2 .1.1 .3.2 .3.1 .2.1 .2.2 97 | # cli -------- cpe -------- int -------- net -------- srv 98 | 99 | ip link add "cli" netns "${NS}_cpe" type veth peer name "cpe" netns "${NS}_cli" 100 | ip link add "cpe" netns "${NS}_int" type veth peer name "int" netns "${NS}_cpe" 101 | ip link add "int" netns "${NS}_net" type veth peer name "net" netns "${NS}_int" 102 | ip link add "net" netns "${NS}_srv" type veth peer name "srv" netns "${NS}_net" 103 | 104 | ip -n "${NS}_cli" link set "cpe" up 105 | ip -n "${NS}_cli" addr add dev "cpe" 10.0.0.2/24 106 | ip -n "${NS}_cli" route add default via 10.0.0.1 dev "cpe" 107 | 108 | ip -n "${NS}_cpe" link set "cli" up 109 | ip -n "${NS}_cpe" addr add dev "cli" 10.0.0.1/24 110 | ip -n "${NS}_cpe" link set "int" up 111 | ip -n "${NS}_cpe" addr add 
dev "int" 10.0.1.2/24 112 | ip -n "${NS}_cpe" route add default via 10.0.1.1 dev "int" 113 | 114 | ip -n "${NS}_int" link set "cpe" up 115 | ip -n "${NS}_int" addr add dev "cpe" 10.0.1.1/24 116 | #tc -n "${NS}_int" qdisc add dev "cpe" root netem rate 10mbit delay 5ms 117 | ip -n "${NS}_int" link set "net" up 118 | ip -n "${NS}_int" addr add dev "net" 10.0.3.2/24 119 | #tc -n "${NS}_int" qdisc add dev "net" root netem rate 10mbit delay 5ms 120 | ip -n "${NS}_int" route add 10.0.0.0/24 via 10.0.1.2 dev "cpe" 121 | ip -n "${NS}_int" route add 10.0.2.0/24 via 10.0.3.1 dev "net" 122 | 123 | ip -n "${NS}_net" link set "int" up 124 | ip -n "${NS}_net" addr add dev "int" 10.0.3.1/24 125 | ip -n "${NS}_net" link set "srv" up 126 | ip -n "${NS}_net" addr add dev "srv" 10.0.2.1/24 127 | ip -n "${NS}_net" route add default via 10.0.3.2 dev "int" 128 | 129 | ip -n "${NS}_srv" link set "net" up 130 | ip -n "${NS}_srv" addr add dev "net" 10.0.2.2/24 131 | ip -n "${NS}_srv" route add default via 10.0.2.1 dev "net" 132 | } 133 | 134 | setup 135 | server 136 | # capture 137 | 138 | tc_client 139 | tc_server 140 | 141 | case "${1}" in 142 | *) 143 | export -f tc_client tc_server 144 | echo -e "\n\tNetns: $(netns)\n\tUse 'ip netns exec ' to execute a command in the netns.\n\tServer: iperf3 -c 10.0.2.2 -R\n" 145 | PS1="client# " ip netns exec ${NS}_cli sh -c "mount -t debugfs none /sys/kernel/debug && bash" 146 | ;; 147 | esac 148 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TCP in UDP 2 | 3 | Middleboxes can mess up with TCP flows, e.g. intercepting the connections and 4 | dropping MPTCP options. Using an TCP-in-UDP tunnel will force such middleboxes 5 | not to modify such TCP connections. The idea here is inspired by an old [IETF 6 | draft](https://datatracker.ietf.org/doc/html/draft-cheshire-tcp-over-udp-00.html). 
7 | 8 | This "tunnel" is done in eBPF, from the TC hooks. For more details about why it 9 | has been created, and its particularities, please check this 10 | [blog post](https://blog.mptcp.dev/2025/07/14/TCP-in-UDP.html). 11 | 12 | ## Headers 13 | 14 | [UDP](https://www.ietf.org/rfc/rfc768.html): 15 | 16 | ``` 17 | 0 1 2 3 18 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 19 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 20 | | Source Port | Destination Port | 21 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 22 | | Length | Checksum | 23 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 24 | ``` 25 | 26 | [TCP](https://www.ietf.org/rfc/rfc9293.html): 27 | 28 | ``` 29 | 0 1 2 3 30 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 31 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 32 | | Source Port | Destination Port | 33 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 34 | | Sequence Number | 35 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 36 | | Acknowledgment Number | 37 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 38 | | Data | |C|E|U|A|P|R|S|F| | 39 | | Offset| Reser |R|C|R|C|S|S|Y|I| Window | 40 | | | |W|E|G|K|H|T|N|N| | 41 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 42 | | Checksum | Urgent Pointer | 43 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 44 | | (Optional) Options | 45 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 46 | ``` 47 | 48 | [TCP-in-UDP](https://datatracker.ietf.org/doc/html/draft-cheshire-tcp-over-udp-00.html): 49 | 50 | ``` 51 | 0 1 2 3 52 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 53 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 54 | | Source Port | Destination Port | 55 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 56 | | 
Length | Checksum | 57 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 58 | | Data | |C|E| |A|P|R|S|F| | 59 | | Offset| Reser |R|C|0|C|S|S|Y|I| Window | 60 | | | |W|E| |K|H|T|N|N| | 61 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 62 | | Sequence Number | 63 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 64 | | Acknowledgment Number | 65 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 66 | | (Optional) Options | 67 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 68 | ``` 69 | 70 | Modifications: 71 | - `URG` set to 0, `Urgent Pointer` is supposed to be zero (not used). 72 | - Switch `Sequence Number` and `Acknowledgment Number` with `Urgent Pointer` and 73 | `Checksum`. 74 | - Replace `Urgent Pointer` by the `Length`: Checksum needs to be recomputed. 75 | 76 | Checksum: 77 | - No need to recompute it from scratch, it can be derived from the previous 78 | values, by just changing the protocol. 79 | 80 | - [UDP Checksum](https://www.rfc-editor.org/rfc/rfc768) computed from: 81 | - Source and destination address: from upper layer 82 | - Protocol (1B): UDP (17) 83 | - Length (2B): Data (variable) + UDP header (8 octets) lengths 84 | - TCP header 85 | - Data 86 | 87 | - [TCP Checksum](https://www.ietf.org/rfc/rfc9293.html#section-3.1-6.18.1) 88 | computed from: 89 | - Source and destination address: from upper layer 90 | - Protocol (1B): TCP (6) 91 | - Length (2B): Data (variable) + TCP header (Between 20 and 56 octets) lengths 92 | - TCP header 93 | - Data 94 | 95 | - Differences: 96 | - Source and destination address: not changed 97 | - Protocol: **changed**: UDP/TCP. 98 | - Data length: not changed 99 | - L4 header: **changed**: `UDP Length` vs `TCP Urgent Pointer` 100 | - Data: not changed 101 | 102 | 103 | ## Build 104 | 105 | Build the binary using `make`. 
Clang, `libelf`, `libc6`, and `libbpf` are 106 | required: 107 | 108 | ``` 109 | sudo apt install make clang libelf-dev libc6-dev-i386 libbpf-dev 110 | ``` 111 | 112 | 113 | ## Setup 114 | 115 | Load it with `tc` commands: 116 | 117 | - Client: 118 | ``` 119 | tc qdisc add dev "${IFACE}" clsact 120 | tc filter add dev "${IFACE}" egress u32 match ip dport "${PORT}" 0xffff action goto chain 1 121 | tc filter add dev "${IFACE}" egress chain 1 bpf object-file tcp_in_udp_tc.o section tc action csum udp 122 | tc filter add dev "${IFACE}" ingress u32 match ip sport "${PORT}" 0xffff action goto chain 1 123 | tc filter add dev "${IFACE}" ingress chain 1 bpf object-file tcp_in_udp_tc.o section tc direct-action 124 | ``` 125 | - Server: 126 | ``` 127 | tc qdisc add dev "${IFACE}" clsact 128 | tc filter add dev "${IFACE}" egress u32 match ip sport "${PORT}" 0xffff action goto chain 1 129 | tc filter add dev "${IFACE}" egress chain 1 bpf object-file tcp_in_udp_tc.o section tc action csum udp 130 | tc filter add dev "${IFACE}" ingress u32 match ip dport "${PORT}" 0xffff action goto chain 1 131 | tc filter add dev "${IFACE}" ingress chain 1 bpf object-file tcp_in_udp_tc.o section tc direct-action 132 | ``` 133 | 134 | On layer 3 interfaces, use the ELF section called `tc_l3`. 135 | 136 | Multiple u32 filters can be used to send the traffic of more than one port to 137 | the BPF program. 138 | 139 | If the TCP program supports setting marks (`SO_MARK`), use it for egress to 140 | prevent processing traffic that is not from the TCP program. For the client, this 141 | allows traffic to a different IP address with the same TCP port. For the server, 142 | this prevents sending packets to the BPF program if the interface has multiple IP 143 | addresses assigned and if the TCP program doesn't bind to all of them.
144 | 145 | - Client & Server: 146 | ``` 147 | tc filter add dev "${IFACE}" egress handle 2 fw action goto chain 1 148 | ``` 149 | 150 | Be warned that `SO_MARK` can't be used for ingress as the system doesn't expect 151 | incoming UDP packets. Therefore, all incoming packets from the interface with a 152 | matching port will be sent to the BPF program. To decrease the chance of this 153 | happening, it is recommended to use ports that are outside the ephemeral port 154 | range set in `net.ipv4.ip_local_port_range` (default: 32768-60999). This option 155 | applies to IPv6 too. 156 | 157 | Generic Segmentation Offload (GSO) and Generic Receive Offload (GRO) cannot be 158 | used for this traffic, because each UDP packet will carry a part of the TCP 159 | headers as part of the data. This part of the data is specific to one packet, 160 | therefore it cannot be merged with the next data. UDP GRO is only done on 161 | demand, e.g. when userspace asks for it (`setsockopt(IPPROTO_UDP, UDP_GRO)`) or 162 | for some in-kernel tunnels, so GRO doesn't need to be disabled. To disable GSO: 163 | 164 | ``` 165 | ip link set ${IFACE} gso_max_segs 0 166 | ``` 167 | 168 | Note: to get some stats, in egress, it is possible to use: 169 | 170 | ``` 171 | tc -s action show action csum 172 | tc -s -j action show action csum | jq 173 | ``` 174 | 175 | It might be interesting to monitor the tracing ring buffer for warnings and 176 | other messages generated by the eBPF program: 177 | 178 | ``` 179 | cat /sys/kernel/debug/tracing/trace_pipe 180 | ``` 181 | 182 | To stop the eBPF program: 183 | 184 | ``` 185 | tc filter del dev "${IFACE}" egress 186 | tc filter del dev "${IFACE}" ingress 187 | ``` 188 | 189 | ## MSS 190 | 191 | Because the packets will be in UDP and not TCP, any MSS clamping will have no 192 | effect here. It is important to avoid IP fragmentation. In other words, it 193 | might be required to adapt the MTU (or the MSS).
194 | -------------------------------------------------------------------------------- /tcp_in_udp_tc.c: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | struct tcp_in_udp_hdr { 14 | struct udphdr udphdr; 15 | __be32 doff_flags_window; 16 | __be32 seq; 17 | __be32 ack_seq; 18 | }; 19 | 20 | /* Header cursor to keep track of current parsing position */ 21 | struct hdr_cursor { 22 | void *pos; 23 | }; 24 | 25 | /******************************************* 26 | ** parse_*hdr helpers from XDP tutorials ** 27 | *******************************************/ 28 | 29 | /* @return: next header */ 30 | static __always_inline int parse_ethhdr(struct hdr_cursor *nh, 31 | void *data_end, 32 | struct ethhdr **ethhdr) 33 | { 34 | struct ethhdr *eth = nh->pos; 35 | int hdrsize = sizeof(*eth); 36 | 37 | /* Byte-count bounds check; check if current pointer + size of header 38 | * is after data_end. 
39 | */ 40 | if ((void *)eth + hdrsize > data_end) 41 | return -1; 42 | 43 | nh->pos += hdrsize; 44 | *ethhdr = eth; 45 | 46 | return eth->h_proto; /* network-byte-order */ 47 | } 48 | 49 | /* @return: next header */ 50 | static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, 51 | void *data_end, 52 | struct ipv6hdr **ip6hdr) 53 | { 54 | struct ipv6hdr *ip6h = nh->pos; 55 | int hdrsize = sizeof(*ip6h); 56 | 57 | if ((void *)ip6h + hdrsize > data_end) 58 | return -1; 59 | 60 | nh->pos += hdrsize; 61 | *ip6hdr = ip6h; 62 | 63 | /* TODO: support extensions */ 64 | return ip6h->nexthdr; 65 | } 66 | 67 | /* @return: next header */ 68 | static __always_inline int parse_iphdr(struct hdr_cursor *nh, 69 | void *data_end, 70 | struct iphdr **iphdr) 71 | { 72 | struct iphdr *iph = nh->pos; 73 | int hdrsize = sizeof(*iph); 74 | 75 | if ((void *)iph + hdrsize > data_end) 76 | return -1; 77 | 78 | hdrsize = iph->ihl << 2; 79 | /* Sanity check packet field is valid */ 80 | if(hdrsize < sizeof(*iph)) 81 | return -1; 82 | 83 | /* Variable-length IPv4 header, need to use byte-based arithmetic */ 84 | if (nh->pos + hdrsize > data_end) 85 | return -1; 86 | 87 | nh->pos += hdrsize; 88 | *iphdr = iph; 89 | 90 | return iph->protocol; 91 | } 92 | 93 | /* @return: header len */ 94 | static __always_inline int parse_tcphdr(struct hdr_cursor *nh, 95 | void *data_end, 96 | struct tcphdr **tcphdr) 97 | { 98 | struct tcphdr *tcph = nh->pos; 99 | int hdrsize = sizeof(*tcph); 100 | 101 | if ((void *)tcph + hdrsize > data_end) 102 | return -1; 103 | 104 | hdrsize = tcph->doff << 2; 105 | /* Sanity check packet field is valid */ 106 | if(hdrsize < sizeof(*tcph)) 107 | return -1; 108 | 109 | /* Variable-length TCP header, need to use byte-based arithmetic */ 110 | if (nh->pos + hdrsize > data_end) 111 | return -1; 112 | 113 | nh->pos += hdrsize; 114 | *tcphdr = tcph; 115 | 116 | return hdrsize; 117 | } 118 | 119 | /* @return: payload len */ 120 | static __always_inline int 
parse_udphdr(struct hdr_cursor *nh, 121 | void *data_end, 122 | struct udphdr **udphdr) 123 | { 124 | struct udphdr *udph = nh->pos; 125 | int hdrsize = sizeof(*udph); 126 | int len; 127 | 128 | if ((void *)udph + hdrsize > data_end) 129 | return -1; 130 | 131 | nh->pos += hdrsize; 132 | *udphdr = udph; 133 | 134 | len = bpf_ntohs(udph->len) - hdrsize; 135 | if (len < 0) 136 | return -1; 137 | 138 | return len; 139 | } 140 | 141 | 142 | /************* 143 | ** Ingress ** 144 | *************/ 145 | 146 | static __always_inline void 147 | udp_to_tcp(struct __sk_buff *skb, struct hdr_cursor *nh, 148 | struct iphdr *iphdr, struct ipv6hdr *ipv6hdr) 149 | { 150 | void *data_end = (void *)(long)skb->data_end; 151 | void *data = (void *)(long)skb->data; 152 | struct tcp_in_udp_hdr *tuhdr, tuhdr_cpy; 153 | struct tcphdr tcphdr; 154 | int nh_off = nh->pos - data; 155 | __u8 proto = IPPROTO_TCP; 156 | __be16 zero = 0; 157 | 158 | if (parse_udphdr(nh, data_end, (struct udphdr**)&tuhdr) < 0) 159 | goto out; 160 | 161 | if (skb->gso_segs > 1) { 162 | bpf_printk("udp-tcp: WARNING, GRO/LRO should be disabled: length:%u, segs:%u, size:%u\n", 163 | skb->len, skb->gso_segs, skb->gso_size); 164 | goto out; 165 | } 166 | 167 | /* Load bytes, because we might only get the UDP header size in case the 168 | * skb is non-linear. 
We could also pull the data, and get nh->pos again 169 | */ 170 | if (bpf_skb_load_bytes(skb, nh_off, &tuhdr_cpy, sizeof(struct tcphdr))) { 171 | bpf_printk("udp-tcp: WARNING: data_end too small: ulen:%u dlen:%u\n", 172 | bpf_ntohs(tuhdr->udphdr.len), data_end - (void *)tuhdr); 173 | goto out; 174 | } 175 | 176 | tcphdr.source = tuhdr_cpy.udphdr.source; 177 | tcphdr.dest = tuhdr_cpy.udphdr.dest; 178 | tcphdr.seq = tuhdr_cpy.seq; 179 | tcphdr.ack_seq = tuhdr_cpy.ack_seq; 180 | __builtin_memcpy((void *)&tcphdr + sizeof(__be32) * 3, 181 | &tuhdr_cpy.doff_flags_window, sizeof(__be32)); 182 | tcphdr.check = tuhdr_cpy.udphdr.check; 183 | bpf_skb_store_bytes(skb, nh_off, &tcphdr, sizeof(tcphdr), 0); 184 | 185 | /* tcphdr->urg_ptr = 0; */ 186 | bpf_skb_store_bytes(skb, nh_off + offsetof(struct tcphdr, urg_ptr), 187 | &zero, sizeof(__be16), BPF_F_RECOMPUTE_CSUM); 188 | 189 | /* Change protocol: UDP -> TCP */ 190 | if (iphdr) { 191 | __be16 proto_old = bpf_htons(IPPROTO_UDP); 192 | __be16 proto_new = bpf_htons(IPPROTO_TCP); 193 | int ip_off = (void*)iphdr - data; 194 | 195 | /* iphdr->protocol = IPPROTO_TCP; */ 196 | bpf_skb_store_bytes(skb, ip_off + offsetof(struct iphdr, protocol), 197 | &proto, sizeof(proto), BPF_F_RECOMPUTE_CSUM); 198 | 199 | bpf_l3_csum_replace(skb, ((void*)iphdr - data) + 200 | offsetof(struct iphdr, check), 201 | proto_old, proto_new, sizeof(__be16)); 202 | bpf_l4_csum_replace(skb, nh_off + offsetof(struct tcphdr, check), 203 | proto_old, proto_new, 204 | BPF_F_PSEUDO_HDR | sizeof(__be16)); 205 | } else if (ipv6hdr) { 206 | __be32 proto_old = bpf_htonl(IPPROTO_UDP); 207 | __be32 proto_new = bpf_htonl(IPPROTO_TCP); 208 | int ipv6_off = (void*)ipv6hdr - data; 209 | 210 | /* ipv6hdr->nexthdr = IPPROTO_TCP; */ 211 | bpf_skb_store_bytes(skb, ipv6_off + offsetof(struct ipv6hdr, nexthdr), 212 | &proto, sizeof(proto), BPF_F_RECOMPUTE_CSUM); 213 | 214 | bpf_l4_csum_replace(skb, nh_off + offsetof(struct tcphdr, check), 215 | proto_old, proto_new, 216 | 
BPF_F_PSEUDO_HDR | sizeof(__be32)); 217 | } 218 | 219 | /* UDP Length vs Urgent Pointer */ 220 | bpf_l4_csum_replace(skb, nh_off + offsetof(struct tcphdr, check), 221 | tuhdr_cpy.udphdr.len, zero, 222 | sizeof(__be16)); 223 | 224 | /* after mangling on headers through direct packet access */ 225 | bpf_set_hash_invalid(skb); 226 | out: 227 | return; 228 | } 229 | 230 | 231 | /************ 232 | ** Egress ** 233 | ************/ 234 | 235 | static __always_inline int 236 | tcp_to_udp(struct __sk_buff *skb, struct hdr_cursor *nh, 237 | struct iphdr *iphdr, struct ipv6hdr *ipv6hdr) 238 | { 239 | void *data_end = (void *)(long)skb->data_end; 240 | void *data = (void *)(long)skb->data; 241 | struct tcp_in_udp_hdr *tuhdr = nh->pos; 242 | struct tcphdr *tcphdr, tcphdr_cpy; 243 | int nh_off = nh->pos - data; 244 | __be16 udp_len, zero = 0; 245 | __be16 proto_old = bpf_htons(IPPROTO_TCP); 246 | __be16 proto_new = bpf_htons(IPPROTO_UDP); 247 | 248 | if (parse_tcphdr(nh, data_end, &tcphdr) < 0) 249 | goto out; 250 | 251 | if (tcphdr->urg) { 252 | if (iphdr) 253 | bpf_printk("tcp-udp: Skip: %pI4:%u -> %pI4:%u: urgent\n", 254 | bpf_ntohl(iphdr->saddr), 255 | bpf_ntohs(tcphdr->source), 256 | bpf_ntohl(iphdr->daddr), 257 | bpf_ntohs(tcphdr->dest)); 258 | else if (ipv6hdr) 259 | bpf_printk("tcp-udp: Skip: %pI6c:%u -> %pI6c:%u: urgent\n", 260 | &ipv6hdr->saddr, 261 | bpf_ntohs(tcphdr->source), 262 | &ipv6hdr->daddr, 263 | bpf_ntohs(tcphdr->dest)); 264 | goto out; /* TODO: or set to 0 and adapt checksum? 
*/ 265 | } 266 | 267 | if (skb->gso_segs > 1) { 268 | bpf_printk("tcp-udp: WARNING, GSO/TSO should be disabled: length:%u, segs:%u, size:%u\n", 269 | skb->len, skb->gso_segs, skb->gso_size); 270 | goto out; 271 | } 272 | 273 | if (iphdr) { 274 | udp_len = bpf_htons(bpf_ntohs(iphdr->tot_len) - 275 | ((void*)tcphdr - (void*)iphdr)); 276 | } else if (ipv6hdr) { 277 | udp_len = ipv6hdr->payload_len; 278 | } else { 279 | goto out; 280 | } 281 | 282 | /* Do the modification before calling bpf_...(skb) helpers which can 283 | * modify the SKB and cause "invalid mem access 'scalar'" errors. 284 | */ 285 | __builtin_memcpy(&tcphdr_cpy, tcphdr, sizeof(struct tcphdr)); 286 | tuhdr->udphdr.check = tcphdr_cpy.check; 287 | __builtin_memcpy(&tuhdr->doff_flags_window, 288 | (void *)&tcphdr_cpy + sizeof(__be32) * 3, sizeof(__be32)); 289 | tuhdr->seq = tcphdr_cpy.seq; 290 | tuhdr->ack_seq = tcphdr_cpy.ack_seq; 291 | 292 | tuhdr->udphdr.len = udp_len; 293 | 294 | /* Change protocol: TCP -> UDP */ 295 | if (iphdr) { 296 | int ip_off = (void*)iphdr - data; 297 | 298 | iphdr->protocol = IPPROTO_UDP; 299 | 300 | bpf_l3_csum_replace(skb, ip_off + offsetof(struct iphdr, check), 301 | proto_old, proto_new, sizeof(__be16)); 302 | } else if (ipv6hdr) { 303 | ipv6hdr->nexthdr = IPPROTO_UDP; 304 | } 305 | bpf_l4_csum_replace(skb, nh_off + offsetof(struct udphdr, check), 306 | proto_old, proto_new, sizeof(__be16) | BPF_F_PSEUDO_HDR); 307 | 308 | /* UDP Length vs Urgent Pointer */ 309 | bpf_l4_csum_replace(skb, nh_off + offsetof(struct udphdr, check), 310 | zero, udp_len, sizeof(__be16)); 311 | 312 | return TC_ACT_PIPE; 313 | out: 314 | return TC_ACT_OK; 315 | } 316 | 317 | SEC("tc") 318 | int tc_tcp_in_udp_l2(struct __sk_buff *skb) 319 | { 320 | void *data_end = (void *)(long)skb->data_end; 321 | void *data = (void *)(long)skb->data; 322 | struct hdr_cursor nh = { .pos = data }; 323 | int eth_type, ip_type, ret = TC_ACT_OK; 324 | struct ipv6hdr *ipv6hdr = NULL; 325 | struct iphdr *iphdr = NULL; 
326 | struct ethhdr *eth; 327 | 328 | eth_type = parse_ethhdr(&nh, data_end, ð); 329 | if (eth_type == bpf_htons(ETH_P_IP)) 330 | ip_type = parse_iphdr(&nh, data_end, &iphdr); 331 | else if (eth_type == bpf_htons(ETH_P_IPV6)) 332 | ip_type = parse_ip6hdr(&nh, data_end, &ipv6hdr); 333 | else 334 | goto out; 335 | 336 | if (ip_type == IPPROTO_TCP) 337 | return tcp_to_udp(skb, &nh, iphdr, ipv6hdr); 338 | if (ip_type == IPPROTO_UDP) 339 | udp_to_tcp(skb, &nh, iphdr, ipv6hdr); 340 | 341 | out: 342 | return ret; 343 | } 344 | 345 | SEC("tc_l3") 346 | int tc_tcp_in_udp_l3(struct __sk_buff *skb) 347 | { 348 | void *data_end = (void *)(long)skb->data_end; 349 | void *data = (void *)(long)skb->data; 350 | struct hdr_cursor nh = { .pos = data }; 351 | int ip_type, ret = TC_ACT_OK; 352 | struct ipv6hdr *ipv6hdr = NULL; 353 | struct iphdr *iphdr = NULL; 354 | 355 | if (skb->protocol == bpf_htons(ETH_P_IP)) 356 | ip_type = parse_iphdr(&nh, data_end, &iphdr); 357 | else if (skb->protocol == bpf_htons(ETH_P_IPV6)) 358 | ip_type = parse_ip6hdr(&nh, data_end, &ipv6hdr); 359 | else 360 | goto out; 361 | 362 | if (ip_type == IPPROTO_TCP) 363 | return tcp_to_udp(skb, &nh, iphdr, ipv6hdr); 364 | if (ip_type == IPPROTO_UDP) 365 | udp_to_tcp(skb, &nh, iphdr, ipv6hdr); 366 | 367 | out: 368 | return ret; 369 | } 370 | 371 | char _license[] SEC("license") = "GPL"; 372 | --------------------------------------------------------------------------------