├── .gitignore ├── LICENSE ├── README.md ├── tracepkt.py └── tracepkt.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Jean-Tiare Le Bigot 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tracepkt 2 | 3 | Trace a ping packet on the L2 layer, as it crosses Linux network interfaces and namespaces. Supports IPv4 and IPv6. 
4 | 5 | ```console 6 | > modprobe ip_tables ip6_tables 7 | > sudo python tracepkt.py 172.17.0.2 8 | NETWORK NS INTERFACE TYPE ADDRESSES IPTABLES 9 | [ 0] request 172.17.0.1 -> 172.17.0.2 nat.OUTPUT :ACCEPT 10 | [ 0] request 172.17.0.1 -> 172.17.0.2 filter.OUTPUT :ACCEPT 11 | [ 4026531992] docker0 request 172.17.0.1 -> 172.17.0.2 nat.POSTROUTING :ACCEPT 12 | [ 4026531992] docker0 request 172.17.0.1 -> 172.17.0.2 13 | [ 4026531992] veth61528aa request 172.17.0.1 -> 172.17.0.2 14 | [ 4026533448] eth0 request 172.17.0.1 -> 172.17.0.2 15 | [ 4026533448] eth0 reply 172.17.0.2 -> 172.17.0.1 16 | [ 4026531992] veth61528aa reply 172.17.0.2 -> 172.17.0.1 17 | [ 4026531992] docker0 reply 172.17.0.2 -> 172.17.0.1 18 | [ 4026531992] docker0 reply 172.17.0.2 -> 172.17.0.1 filter.INPUT :ACCEPT 19 | ``` 20 | 21 | The trace above follows the first 2 packets (the echo request and its reply) going from the current network namespace to a Docker container and back, crossing a veth pair and a bridge. 22 | 23 | ## Features 24 | 25 | * Record crossed Linux network interfaces 26 | * Record crossed Linux network namespaces 27 | * Record crossed iptables chains 28 | 29 | **Note**: The iptables tracing support is ***experimental***. Tracing which specific 30 | rule matches within a chain is not supported. This would require hooking into the 31 | ``trace_packet`` netfilter function, which is unfortunately ``static`` and 32 | therefore private. 33 | 34 | ## The full story 35 | 36 | This project started as an illustration for a blog post on perf and eBPF: https://blog.yadutaf.fr/2017/07/28/tracing-a-packet-journey-using-linux-tracepoints-perf-ebpf/. 37 | 38 | ## Usage 39 | 40 | To use this project, you need a working, recent BCC install on your system. Read more about BCC on their GitHub repository: https://github.com/iovisor/bcc. 41 | 42 | Additionally, you'll need a recent kernel (presumably >= 4.7) and full root privileges. 
#!/usr/bin/env python
# coding: utf-8
"""Trace one ping across network interfaces, namespaces and iptables chains.

The eBPF program in ``tracepkt.c`` publishes events on the ``route_evt`` perf
buffer. This script launches a single ``ping``, decodes those events and
prints the ones whose ICMP echo identifier matches that ping process.
"""

import ctypes as ct
import os
import subprocess
import sys
from socket import inet_ntop, AF_INET, AF_INET6
from struct import pack

IFNAMSIZ = 16  # uapi/linux/if.h
XT_TABLE_MAXNAMELEN = 32  # uapi/linux/netfilter/x_tables.h

# uapi/linux/netfilter.h
NF_VERDICT_NAME = [
    'DROP',
    'ACCEPT',
    'STOLEN',
    'QUEUE',
    'REPEAT',
    'STOP',
]

# uapi/linux/netfilter.h
# net/ipv4/netfilter/ip_tables.c
HOOKNAMES = [
    "PREROUTING",
    "INPUT",
    "FORWARD",
    "OUTPUT",
    "POSTROUTING",
]

# Bits of TestEvt.flags, mirroring the #defines in tracepkt.c
ROUTE_EVT_IF = 1
ROUTE_EVT_IPTABLE = 2

# tracepkt.c fills icmpid from a 16-bit big-endian field, so a PID can only
# ever be matched modulo 2**16.
ICMP_ID_MASK = 0xFFFF


class TestEvt(ct.Structure):
    """ctypes mirror of ``struct route_evt_t`` from ``tracepkt.c``.

    Field order and sizes must stay in sync with the C definition, because
    events are decoded by casting the raw perf-buffer payload to this type.
    """
    _fields_ = [
        # Content flags
        ("flags", ct.c_ulonglong),

        # Routing information
        ("ifname", ct.c_char * IFNAMSIZ),
        ("netns", ct.c_ulonglong),

        # Packet type (IPv4 or IPv6) and address
        ("ip_version", ct.c_ulonglong),
        ("icmptype", ct.c_ulonglong),
        ("icmpid", ct.c_ulonglong),
        ("icmpseq", ct.c_ulonglong),
        ("saddr", ct.c_ulonglong * 2),
        ("daddr", ct.c_ulonglong * 2),

        # Iptables trace
        ("hook", ct.c_ulonglong),
        ("verdict", ct.c_ulonglong),
        ("tablename", ct.c_char * XT_TABLE_MAXNAMELEN),
    ]


# PID of the ping process being traced; negative until ping has been started.
PING_PID = -1


def _get(l, index, default):
    '''
    Get element at index in l or return the default

    Negative indexes are treated as out of range instead of counting from the
    end of the list, so an unexpected value never maps to a valid name.
    '''
    if 0 <= index < len(l):
        return l[index]
    return default


def _text(raw):
    '''
    Decode a NUL-trimmed ``c_char`` array value into text for display
    '''
    return raw.decode("utf-8", "replace")


def event_printer(cpu, data, size):
    """Perf-buffer callback: decode one event and print it if it is ours.

    :param cpu: CPU the event was emitted on (unused).
    :param data: pointer to the raw event, laid out as ``TestEvt``.
    :param size: size of the raw event in bytes (unused).

    Events are silently dropped when they carry no interface information, do
    not belong to the traced ping, are neither IPv4 nor IPv6, or are not an
    ICMP echo request/reply.
    """
    # Decode event
    event = ct.cast(data, ct.POINTER(TestEvt)).contents

    # Make sure this is an interface event
    if event.flags & ROUTE_EVT_IF != ROUTE_EVT_IF:
        return

    # Make sure it is OUR ping process. Only the low 16 bits of the PID fit
    # in the identifier, so compare against the truncated PID.
    if PING_PID < 0 or event.icmpid != (PING_PID & ICMP_ID_MASK):
        return

    # Decode address
    if event.ip_version == 4:
        # The IPv4 address was stored as-is (network byte order) in the first
        # word, so repacking it natively yields the original 4 bytes.
        saddr = inet_ntop(AF_INET, pack("=I", event.saddr[0]))
        daddr = inet_ntop(AF_INET, pack("=I", event.daddr[0]))
    elif event.ip_version == 6:
        # Copy the 16 raw bytes out of the ctypes array so inet_ntop gets a
        # plain byte string on every Python version.
        saddr = inet_ntop(AF_INET6, ct.string_at(event.saddr, ct.sizeof(event.saddr)))
        daddr = inet_ntop(AF_INET6, ct.string_at(event.daddr, ct.sizeof(event.daddr)))
    else:
        return

    # Decode direction (ICMP type: 8/0 for IPv4, 128/129 for IPv6)
    if event.icmptype in [8, 128]:
        direction = "request"
    elif event.icmptype in [0, 129]:
        direction = "reply"
    else:
        return

    # Decode flow
    flow = "%s -> %s" % (saddr, daddr)

    # Optionally decode iptables events
    iptables = ""
    if event.flags & ROUTE_EVT_IPTABLE == ROUTE_EVT_IPTABLE:
        verdict = _get(NF_VERDICT_NAME, event.verdict, "~UNK~")
        hook = _get(HOOKNAMES, event.hook, "~UNK~")
        iptables = " %7s.%-12s:%s" % (_text(event.tablename), hook, verdict)

    # Print event. The single-argument call form behaves the same under the
    # Python 2 print statement and the Python 3 print function.
    print("[%12s] %16s %7s %-34s%s" % (event.netns, _text(event.ifname), direction, flow, iptables))


def main(argv):
    """Trace a single ping to the target given in *argv*.

    :param argv: command line, ``[program, optional target IP]``.
    :returns: ping's exit status, or 1 on a usage error.
    """
    global PING_PID

    # Get arguments
    if len(argv) == 1:
        target = '127.0.0.1'
    elif len(argv) == 2:
        target = argv[1]
    else:
        print("Usage: %s [TARGET_IP]" % (argv[0]))
        return 1

    # Imported here so the decoding code above stays importable (and
    # testable) on machines without BCC.
    from bcc import BPF

    # Build probe and open event buffer. The C source is looked up next to
    # this script so the tool also works from another working directory.
    here = os.path.dirname(os.path.abspath(__file__))
    b = BPF(src_file=os.path.join(here, 'tracepkt.c'))
    b["route_evt"].open_perf_buffer(event_printer)

    # kprobe_poll is the deprecated name of perf_buffer_poll; prefer the
    # current one when this BCC version provides it.
    poll = getattr(b, "perf_buffer_poll", None) or b.kprobe_poll

    # Launch a background ping process. /dev/null must be opened for writing
    # since it backs ping's stdout and stderr.
    with open('/dev/null', 'w') as devnull:
        ping = subprocess.Popen([
                '/bin/ping',
                '-c1',
                target,
            ],
            stdout=devnull,
            stderr=devnull,
            close_fds=True,
        )
    PING_PID = ping.pid

    print("%14s %16s %7s %-34s %s" % ('NETWORK NS', 'INTERFACE', 'TYPE', 'ADDRESSES', 'IPTABLES'))

    # Listen for event until the ping process has exited
    while ping.poll() is None:
        poll(10)

    # Pick up events that were still queued when ping exited
    poll(10)

    # Forward ping's exit code
    return ping.returncode


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Forward ping's exit code 147 | sys.exit(ping.poll()) 148 | -------------------------------------------------------------------------------- /tracepkt.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define ROUTE_EVT_IF 1 10 | #define ROUTE_EVT_IPTABLE 2 11 | 12 | // Event structure 13 | struct route_evt_t { 14 | /* Content flags */ 15 | u64 flags; 16 | 17 | /* Routing information */ 18 | char ifname[IFNAMSIZ]; 19 | u64 netns; 20 | 21 | /* Packet type (IPv4 or IPv6) and address */ 22 | u64 ip_version; // familiy (IPv4 or IPv6) 23 | u64 icmptype; 24 | u64 icmpid; // In practice, this is the PID of the ping process (see "ident" field in https://github.com/iputils/iputils/blob/master/ping_common.c) 25 | u64 icmpseq; // Sequence number 26 | u64 saddr[2]; // Source address. IPv4: store in saddr[0] 27 | u64 daddr[2]; // Dest address. IPv4: store in daddr[0] 28 | 29 | /* Iptables trace */ 30 | u64 hook; 31 | u64 verdict; 32 | char tablename[XT_TABLE_MAXNAMELEN]; 33 | }; 34 | BPF_PERF_OUTPUT(route_evt); 35 | 36 | // Arg stash structure 37 | struct ipt_do_table_args 38 | { 39 | struct sk_buff *skb; 40 | const struct nf_hook_state *state; 41 | struct xt_table *table; 42 | }; 43 | BPF_HASH(cur_ipt_do_table_args, u32, struct ipt_do_table_args); 44 | 45 | #define MAC_HEADER_SIZE 14; 46 | #define member_address(source_struct, source_member) \ 47 | ({ \ 48 | void* __ret; \ 49 | __ret = (void*) (((char*)source_struct) + offsetof(typeof(*source_struct), source_member)); \ 50 | __ret; \ 51 | }) 52 | #define member_read(destination, source_struct, source_member) \ 53 | do{ \ 54 | bpf_probe_read( \ 55 | destination, \ 56 | sizeof(source_struct->source_member), \ 57 | member_address(source_struct, source_member) \ 58 | ); \ 59 | } while(0) 60 | 61 | /** 62 | * Common tracepoint handler. 
Detect IPv4/IPv6 ICMP echo request and replies and 63 | * emit event with address, interface and namespace. 64 | */ 65 | static inline int do_trace_skb(struct route_evt_t *evt, void *ctx, struct sk_buff *skb) 66 | { 67 | // Prepare event for userland 68 | evt->flags |= ROUTE_EVT_IF; 69 | 70 | // Compute MAC header address 71 | char* head; 72 | u16 mac_header; 73 | u16 network_header; 74 | 75 | member_read(&head, skb, head); 76 | member_read(&mac_header, skb, mac_header); 77 | member_read(&network_header, skb, network_header); 78 | 79 | if(network_header == 0) { 80 | network_header = mac_header + MAC_HEADER_SIZE; 81 | } 82 | 83 | // Compute IP Header address 84 | char *ip_header_address = head + network_header; 85 | 86 | // Abstract IPv4 / IPv6 87 | u8 proto_icmp; 88 | u8 proto_icmp_echo_request; 89 | u8 proto_icmp_echo_reply; 90 | u8 icmp_offset_from_ip_header; 91 | u8 l4proto; 92 | 93 | // Load IP protocol version 94 | bpf_probe_read(&evt->ip_version, sizeof(u8), ip_header_address); 95 | evt->ip_version = evt->ip_version >> 4 & 0xf; 96 | 97 | // Filter IP packets 98 | if (evt->ip_version == 4) { 99 | // Load IP Header 100 | struct iphdr iphdr; 101 | bpf_probe_read(&iphdr, sizeof(iphdr), ip_header_address); 102 | 103 | // Load protocol and address 104 | icmp_offset_from_ip_header = iphdr.ihl * 4; 105 | l4proto = iphdr.protocol; 106 | evt->saddr[0] = iphdr.saddr; 107 | evt->daddr[0] = iphdr.daddr; 108 | 109 | // Load constants 110 | proto_icmp = IPPROTO_ICMP; 111 | proto_icmp_echo_request = ICMP_ECHO; 112 | proto_icmp_echo_reply = ICMP_ECHOREPLY; 113 | } else if (evt->ip_version == 6) { 114 | // Assume no option header --> fixed size header 115 | struct ipv6hdr* ipv6hdr = (struct ipv6hdr*)ip_header_address; 116 | icmp_offset_from_ip_header = sizeof(*ipv6hdr); 117 | 118 | // Load protocol and address 119 | bpf_probe_read(&l4proto, sizeof(ipv6hdr->nexthdr), (char*)ipv6hdr + offsetof(struct ipv6hdr, nexthdr)); 120 | bpf_probe_read(evt->saddr, sizeof(ipv6hdr->saddr), 
(char*)ipv6hdr + offsetof(struct ipv6hdr, saddr)); 121 | bpf_probe_read(evt->daddr, sizeof(ipv6hdr->daddr), (char*)ipv6hdr + offsetof(struct ipv6hdr, daddr)); 122 | 123 | // Load constants 124 | proto_icmp = IPPROTO_ICMPV6; 125 | proto_icmp_echo_request = ICMPV6_ECHO_REQUEST; 126 | proto_icmp_echo_reply = ICMPV6_ECHO_REPLY; 127 | } else { 128 | return 0; 129 | } 130 | 131 | // Filter ICMP packets 132 | if (l4proto != proto_icmp) { 133 | return 0; 134 | } 135 | 136 | // Compute ICMP header address and load ICMP header 137 | char* icmp_header_address = ip_header_address + icmp_offset_from_ip_header; 138 | struct icmphdr icmphdr; 139 | bpf_probe_read(&icmphdr, sizeof(icmphdr), icmp_header_address); 140 | 141 | // Filter ICMP echo request and echo reply 142 | if (icmphdr.type != proto_icmp_echo_request && icmphdr.type != proto_icmp_echo_reply) { 143 | return 0; 144 | } 145 | 146 | // Get ICMP info 147 | evt->icmptype = icmphdr.type; 148 | evt->icmpid = icmphdr.un.echo.id; 149 | evt->icmpseq = icmphdr.un.echo.sequence; 150 | 151 | // Fix endian 152 | evt->icmpid = be16_to_cpu(evt->icmpid); 153 | evt->icmpseq = be16_to_cpu(evt->icmpseq); 154 | 155 | // Get device pointer, we'll need it to get the name and network namespace 156 | struct net_device *dev; 157 | member_read(&dev, skb, dev); 158 | 159 | // Load interface name 160 | bpf_probe_read(&evt->ifname, IFNAMSIZ, dev->name); 161 | 162 | #ifdef CONFIG_NET_NS 163 | struct net* net; 164 | 165 | // Get netns id. 
The code below is equivalent to: evt->netns = dev->nd_net.net->ns.inum 166 | possible_net_t *skc_net = &dev->nd_net; 167 | member_read(&net, skc_net, net); 168 | struct ns_common* ns = member_address(net, ns); 169 | member_read(&evt->netns, ns, inum); 170 | #endif 171 | 172 | return 0; 173 | } 174 | 175 | static inline int do_trace(void *ctx, struct sk_buff *skb) 176 | { 177 | // Prepare event for userland 178 | struct route_evt_t evt = {}; 179 | 180 | // Process packet 181 | int ret = do_trace_skb(&evt, ctx, skb); 182 | 183 | // Send event 184 | route_evt.perf_submit(ctx, &evt, sizeof(evt)); 185 | 186 | // Return 187 | return ret; 188 | } 189 | 190 | /** 191 | * Attach to Kernel Interface Tracepoints 192 | */ 193 | 194 | TRACEPOINT_PROBE(net, netif_rx) 195 | { 196 | return do_trace(args, (struct sk_buff *)args->skbaddr); 197 | } 198 | 199 | TRACEPOINT_PROBE(net, net_dev_queue) 200 | { 201 | return do_trace(args, (struct sk_buff *)args->skbaddr); 202 | } 203 | 204 | TRACEPOINT_PROBE(net, napi_gro_receive_entry) 205 | { 206 | return do_trace(args, (struct sk_buff *)args->skbaddr); 207 | } 208 | 209 | TRACEPOINT_PROBE(net, netif_receive_skb_entry) 210 | { 211 | return do_trace(args, (struct sk_buff *)args->skbaddr); 212 | } 213 | 214 | /** 215 | * Common iptables functions 216 | */ 217 | 218 | static inline int __ipt_do_table_in(struct pt_regs *ctx, struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table) 219 | { 220 | u32 pid = bpf_get_current_pid_tgid(); 221 | 222 | // stash the arguments for use in retprobe 223 | struct ipt_do_table_args args = { 224 | .skb = skb, 225 | .state = state, 226 | .table = table, 227 | }; 228 | cur_ipt_do_table_args.update(&pid, &args); 229 | return 0; 230 | }; 231 | 232 | static inline int __ipt_do_table_out(struct pt_regs * ctx) 233 | { 234 | // Load arguments 235 | u32 pid = bpf_get_current_pid_tgid(); 236 | struct ipt_do_table_args *args; 237 | args = cur_ipt_do_table_args.lookup(&pid); 238 | if (args == 0) 
239 | { 240 | return 0; // missed entry 241 | } 242 | cur_ipt_do_table_args.delete(&pid); 243 | 244 | // Prepare event for userland 245 | struct route_evt_t evt = { 246 | .flags = ROUTE_EVT_IPTABLE, 247 | }; 248 | 249 | // Load packet information 250 | struct sk_buff *skb = args->skb; 251 | do_trace_skb(&evt, ctx, skb); 252 | 253 | // Store the hook 254 | const struct nf_hook_state *state = args->state; 255 | member_read(&evt.hook, state, hook); 256 | 257 | // Store the table name 258 | struct xt_table *table = args->table; 259 | member_read(&evt.tablename, table, name); 260 | 261 | // Store the verdict 262 | int ret = PT_REGS_RC(ctx); 263 | evt.verdict = ret; 264 | 265 | // Send event 266 | route_evt.perf_submit(ctx, &evt, sizeof(evt)); 267 | 268 | return 0; 269 | } 270 | 271 | /** 272 | * Attach to Kernel iptables main function 273 | */ 274 | 275 | int kprobe__ipt_do_table(struct pt_regs *ctx, struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table) 276 | { 277 | return __ipt_do_table_in(ctx, skb, state, table); 278 | }; 279 | 280 | int kretprobe__ipt_do_table(struct pt_regs *ctx) 281 | { 282 | return __ipt_do_table_out(ctx); 283 | } 284 | 285 | int kprobe__ip6t_do_table(struct pt_regs *ctx, struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table) 286 | { 287 | return __ipt_do_table_in(ctx, skb, state, table); 288 | }; 289 | 290 | int kretprobe__ip6t_do_table(struct pt_regs *ctx) 291 | { 292 | return __ipt_do_table_out(ctx); 293 | } 294 | --------------------------------------------------------------------------------