├── bio-and-req-sizes.py
├── blk-and-read-sizes.py
├── blk-request-sizes.py
├── btrfs-create-issue
├── btrfs-csum-leak.py
├── btrfs-send-patches
├── cache-pressure
│   ├── cache-pressure.sh
│   ├── files-in-cache.sh
│   ├── read-dir.c
│   ├── read-file.c
│   └── watch-alloc-inode.py
├── check.sh
├── codepaths.py
├── deadlock.py
├── dio-io-sizes.py
├── eio-stress.py
├── enospc-stress
│   ├── Makefile
│   ├── common.h
│   ├── enospc-stress.bpf.c
│   ├── enospc-stress.c
│   ├── test.sh
│   └── vmlinux.h
├── error-injection-stress.py
├── error.sh
├── find-missing-range.py
├── fuck-you-pagecache.py
├── inject-error.py
├── kernelparse
│   ├── codepaths.py
│   ├── kernelparse.py
│   ├── printpaths.py
│   ├── test-parse.py
│   └── test.c
├── kswapd-work.py
├── mm-drgn-helpers.py
├── old-socket-debug.py
├── orphans.btrd
├── read-pattern.py
├── referenced-objects.py
├── rq-latency-dist.py
├── sched-time.py
├── snapshot-balance.sh
├── socket-debug.py
├── test-mmap-sync.c
├── test-parse.py
├── timing-everything.py
├── timing.py
├── unbalanced-reproducer
│   ├── new-unbalanced.json
│   ├── unbalanced-v1.sh
│   └── unbalanced.sh
├── what-the-fuck-are-we-doing.py
├── xfs-get-blocks.py
└── xfs-hang
    ├── inject-error.py
    ├── reproducer.sh
    ├── test.sh
    └── xfs-log-paths.txt
/bio-and-req-sizes.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | import glob 3 | import os 4 | import re 5 | import time 6 | import argparse 7 | from time import sleep 8 | import signal 9 | import ctypes as ct 10 | 11 | debug = 0 12 | 13 | bpf_text = """ 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | typedef struct request_size_s { 22 | u64 size; 23 | u64 read; 24 | } request_size_t; 25 | 26 | BPF_PERF_OUTPUT(bio_events); 27 | BPF_PERF_OUTPUT(iter_events); 28 | BPF_PERF_OUTPUT(req_events); 29 | BPF_PERF_OUTPUT(split_events); 30 | 31 | // This sucks, but we have no better solution 32 | static dev_t get_devt(struct request *req) 33 | { 34 | struct gendisk *disk = req->rq_disk; 35 | return disk->part0.__dev.devt; 36 | } 37 | 38 | int trace_req_start(struct pt_regs *ctx, struct request *req) 39 | { 40 | dev_t device = get_devt(req); 41 | int major = MAJOR(device); 42 | int minor = MINOR(device); 43 | 44 | if (!(CONDITIONALS)) 45 | return 0; 46 | request_size_t data = { 47 | .size = req->__data_len, 48 | .read = !(req->cmd_flags & 1), 49 | }; 50 | req_events.perf_submit(ctx, &data, sizeof(data)); 51 | return 0; 52 | } 53 | 54 | int trace_bio_split(struct pt_regs *ctx, struct bio *bio, int nr_sectors) 55 | { 56 | dev_t device = bio->bi_bdev->bd_disk->part0.__dev.devt; 57 | int major = MAJOR(device); 58 | int minor = MINOR(device); 59 | 60 | if (!(CONDITIONALS)) 61 | return 0; 62 | request_size_t data = { 63 | .size = nr_sectors << 9, 64 | .read = !(bio->bi_opf & 1), 65 | }; 66 | split_events.perf_submit(ctx, &data, sizeof(data)); 67 | return 0; 68 | } 69 | 70 | int trace_submit_bio(struct pt_regs *ctx, struct bio *bio) 71 | { 72 | dev_t device = bio->bi_bdev->bd_disk->part0.__dev.devt; 73 | int major = MAJOR(device); 74 | int minor = MINOR(device); 75 | u64 count = bio->bi_iter.bi_size; 76 | 77 | if (!(CONDITIONALS)) 78 | return 0; 79 | request_size_t data = { 80 | .size = count, 81 | .read = !(bio->bi_opf & 1), 82 | }; 83 | bio_events.perf_submit(ctx, &data, sizeof(data)); 84 | return 0; 85 | } 86 | 87 | typedef struct bio_storage_s { 88 | struct bio *bio; 89 | } bio_storage_t; 90 | 91 | BPF_HASH(bios, u64, bio_storage_t); 92 | 93 | int trace_bio_iov_iter_get_pages(struct pt_regs *ctx, struct bio *bio) 94 |
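/*
 * Entry/return pair: this kprobe stashes the bio pointer in a hash keyed
 * by pid_tgid, and the matching kretprobe below reads bi_iter.bi_size
 * back out with bpf_probe_read() to report how much of the iov_iter
 * actually got mapped into the bio by bio_iov_iter_get_pages().
 */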
{ 95 | u64 pid = bpf_get_current_pid_tgid(); 96 | bio_storage_t data = { 97 | .bio = bio, 98 | }; 99 | bios.update(&pid, &data); 100 | return 0; 101 | } 102 | 103 | int trace_bio_iov_iter_get_pages_ret(struct pt_regs *ctx) 104 | { 105 | u64 pid = bpf_get_current_pid_tgid(); 106 | bio_storage_t *data; 107 | 108 | data = bios.lookup(&pid); 109 | if (!data) 110 | return 0; 111 | 112 | u64 opf; 113 | request_size_t req = {}; 114 | bpf_probe_read(&req.size, sizeof(u64), &data->bio->bi_iter.bi_size); 115 | bpf_probe_read(&opf, sizeof(u64), &data->bio->bi_opf); 116 | req.read = !(opf & 1); 117 | iter_events.perf_submit(ctx, &req, sizeof(req)); 118 | bios.delete(&pid); 119 | return 0; 120 | } 121 | 122 | """ 123 | 124 | parser = argparse.ArgumentParser() 125 | parser.add_argument("-d", "--device", 126 | help="Trace this device only") 127 | args = parser.parse_args() 128 | 129 | disks = [] 130 | if args.device: 131 | disks.append({'name': os.path.basename(args.device)}) 132 | else: 133 | dev_patterns = ['sd.*', 'nvme.*', 'nbd.*', 'md.*', "fio*", "etherd*"] 134 | for device in glob.glob("/sys/block/*"): 135 | for pattern in dev_patterns: 136 | if re.compile(pattern).match(os.path.basename(device)): 137 | if pattern == "etherd*": 138 | disks.append({'name': os.path.basename(device).replace('!', '/')}) 139 | else: 140 | disks.append({'name': os.path.basename(device)}) 141 | if debug: 142 | print(disks) 143 | 144 | first = True 145 | conditional_template = "(major == MAJOR && minor == MINOR)" 146 | conditionals = "" 147 | for disk in disks: 148 | stinfo = os.stat('/dev/{}'.format(disk['name'])) 149 | disk['major'] = os.major(stinfo.st_rdev) 150 | disk['minor'] = os.minor(stinfo.st_rdev) 151 | tmp = conditional_template.replace('MAJOR', "{}".format(disk['major'])) 152 | tmp = tmp.replace('MINOR', "{}".format(disk['minor'])) 153 | if not first: 154 | conditionals += " || " 155 | first = False 156 | conditionals += tmp 157 | 158 | if conditionals == "": 159 | conditionals = "1" 160 | bpf_text = bpf_text.replace('CONDITIONALS', conditionals) 161 | 162 | # load BPF program 163 | b = BPF(text=bpf_text) 164 | b.attach_kprobe(event="submit_bio", fn_name="trace_submit_bio") 165 | b.attach_kprobe(event="bio_iov_iter_get_pages", fn_name="trace_bio_iov_iter_get_pages") 166 | b.attach_kretprobe(event="bio_iov_iter_get_pages", fn_name="trace_bio_iov_iter_get_pages_ret") 167 | b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") 168 | b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") 169 | b.attach_kprobe(event="bio_split", fn_name="trace_bio_split") 170 | 171 | class RequestSize(ct.Structure): 172 | _fields_ = [ 173 | ("size", ct.c_ulonglong), 174 | ("read", ct.c_ulonglong), 175 | ] 176 | 177 | def print_size(prestr, event): 178 | iostr = "write" 179 | if event.read == 1: 180 | iostr = "read" 181 | print("{} {}: {}".format(prestr, iostr, event.size)) 182 | 183 | def print_bio_size(cpu, data, size): 184 | event = ct.cast(data, ct.POINTER(RequestSize)).contents 185 | print_size("bio", event) 186 | 187 | def print_iter_size(cpu, data, size): 188 | event = ct.cast(data, ct.POINTER(RequestSize)).contents 189 | print_size("iter", event) 190 | 191 | def print_req_size(cpu, data, size): 192 | event = ct.cast(data, ct.POINTER(RequestSize)).contents 193 | print_size("req", event) 194 | 195 | def print_split_size(cpu, data, size): 196 | event = ct.cast(data, ct.POINTER(RequestSize)).contents 197 | print_size("split", event) 198 | 199 | b["bio_events"].open_perf_buffer(print_bio_size) 200 
| b["iter_events"].open_perf_buffer(print_iter_size) 201 | b["req_events"].open_perf_buffer(print_req_size) 202 | b["split_events"].open_perf_buffer(print_split_size) 203 | while 1: 204 | b.kprobe_poll() 205 | -------------------------------------------------------------------------------- /blk-and-read-sizes.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | import glob 3 | import os 4 | import re 5 | import time 6 | import argparse 7 | from time import sleep 8 | import signal 9 | 10 | debug = 0 11 | 12 | def signal_ignore(signal, frame): 13 | print() 14 | 15 | class SignalInterrupt(Exception): 16 | def __init__(self, message): 17 | super(SignalInterrupt, self).__init__(message) 18 | 19 | def signal_stop(signal, frame): 20 | raise SignalInterrupt("Interrupted!") 21 | 22 | bpf_text = """ 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | typedef struct dev_key_s { 31 | u64 dev; 32 | u64 slot; 33 | } dev_key_t; 34 | 35 | BPF_HISTOGRAM(user_reads, dev_key_t); 36 | BPF_HISTOGRAM(user_writes, dev_key_t); 37 | BPF_HISTOGRAM(reads, dev_key_t); 38 | BPF_HISTOGRAM(writes, dev_key_t); 39 | BPF_HISTOGRAM(discards, dev_key_t); 40 | 41 | // This sucks, but we have no better solution 42 | static dev_t get_devt(struct request *req) 43 | { 44 | struct gendisk *disk = req->rq_disk; 45 | return disk->part0.__dev.devt; 46 | } 47 | 48 | // time block I/O 49 | int trace_req_start(struct pt_regs *ctx, struct request *req) 50 | { 51 | dev_t device = get_devt(req); 52 | int major = MAJOR(device); 53 | int minor = MINOR(device); 54 | 55 | if (req->__data_len == 0) 56 | return 0; 57 | 58 | if (!(CONDITIONALS)) 59 | return 0; 60 | 61 | dev_key_t key = { 62 | .dev = device, 63 | .slot = bpf_log2l(req->__data_len), 64 | }; 65 | 66 | if (req->cmd_flags & REQ_DISCARD) 67 | discards.increment(key); 68 | else if ((req->cmd_flags & 1) != 0) 69 | writes.increment(key); 70 | else 71 | reads.increment(key); 72 | return 0; 73 | } 74 | 75 | static int user_operation(struct kiocb *iocb, struct iov_iter *iov, int write) 76 | { 77 | dev_t device = iocb->ki_filp->f_mapping->host->i_sb->s_bdev->bd_disk->part0.__dev.devt; 78 | int major = MAJOR(device); 79 | int minor = MINOR(device); 80 | u64 count = iov->count; 81 | 82 | if (!(CONDITIONALS)) 83 | return 0; 84 | 85 | dev_key_t key = { 86 | .dev = device, 87 | .slot = bpf_log2l(count), 88 | }; 89 | 90 | if (write) 91 | user_writes.increment(key); 92 | else 93 | user_reads.increment(key); 94 | return 0; 95 | } 96 | 97 | int trace_generic_file_read_iter(struct pt_regs *ctx, struct kiocb *iocb, 98 | struct iov_iter *iov) 99 | { 100 | return user_operation(iocb, iov, 0); 101 | } 102 | 103 | int trace_generic_file_write_iter(struct pt_regs *ctx, struct kiocb *iocb, 104 | struct iov_iter *iov) 105 | { 106 | return user_operation(iocb, iov, 1); 107 | } 108 | """ 109 | 110 | parser = argparse.ArgumentParser() 111 | parser.add_argument("-d", "--device", 112 | help="Trace this device only") 113 | args = parser.parse_args() 114 | 115 | disks = [] 116 | if args.device: 117 | disks.append({'name': os.path.basename(args.device)}) 118 | else: 119 | dev_patterns = ['sd.*', 'nvme.*', 'nbd.*', 'md.*', "fio*", "etherd*"] 120 | for device in glob.glob("/sys/block/*"): 121 | for pattern in dev_patterns: 122 | if re.compile(pattern).match(os.path.basename(device)): 123 | if pattern == "etherd*": 124 | disks.append({'name': os.path.basename(device).replace('!', '/')}) 125 | else: 126 | 
disks.append({'name': os.path.basename(device)}) 127 | if debug: 128 | print(disks) 129 | 130 | first = True 131 | conditional_template = "(major == MAJOR && minor == MINOR)" 132 | conditionals = "" 133 | for disk in disks: 134 | stinfo = os.stat('/dev/{}'.format(disk['name'])) 135 | disk['major'] = os.major(stinfo.st_rdev) 136 | disk['minor'] = os.minor(stinfo.st_rdev) 137 | tmp = conditional_template.replace('MAJOR', "{}".format(disk['major'])) 138 | tmp = tmp.replace('MINOR', "{}".format(disk['minor'])) 139 | if not first: 140 | conditionals += " || " 141 | first = False 142 | conditionals += tmp 143 | 144 | if conditionals == "": 145 | conditionals = "1" 146 | bpf_text = bpf_text.replace('CONDITIONALS', conditionals) 147 | 148 | if debug: 149 | print(bpf_text) 150 | 151 | # load BPF program 152 | b = BPF(text=bpf_text) 153 | b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") 154 | b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") 155 | b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_generic_file_read_iter") 156 | b.attach_kprobe(event="generic_file_write_iter", fn_name="trace_generic_file_write_iter") 157 | 158 | reads = b.get_table("reads") 159 | writes = b.get_table("writes") 160 | discards= b.get_table("discards") 161 | user_reads = b.get_table("user_reads") 162 | user_writes = b.get_table("user_writes") 163 | 164 | print("Tracing, hit Ctrl+C to exit") 165 | signal.signal(signal.SIGINT, signal_stop) 166 | try: 167 | sleep(99999999) 168 | except SignalInterrupt: 169 | signal.signal(signal.SIGINT, signal_ignore) 170 | except KeyboardInterrupt: 171 | signal.signal(signal.SIGINT, signal_ignore) 172 | 173 | def print_device(dev): 174 | MINORBITS = 20 175 | MINORMASK = (1 << MINORBITS) - 1 176 | major = dev >> MINORBITS 177 | minor = dev & MINORMASK 178 | for disk in disks: 179 | if disk['major'] == major and disk['minor'] == minor: 180 | return disk['name'] 181 | return "%d-%d" % (major, minor) 182 | 183 | reads.print_log2_hist("Reads", "dev", section_print_fn=print_device) 184 | writes.print_log2_hist("Writes", "dev", section_print_fn=print_device) 185 | discards.print_log2_hist("Discards", "dev", section_print_fn=print_device) 186 | user_reads.print_log2_hist("User Reads", "dev", section_print_fn=print_device) 187 | user_writes.print_log2_hist("User Writes", "dev", section_print_fn=print_device) 188 | -------------------------------------------------------------------------------- /blk-request-sizes.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | import glob 3 | import os 4 | import re 5 | import time 6 | import argparse 7 | from time import sleep 8 | import signal 9 | 10 | debug = 0 11 | 12 | def signal_ignore(signal, frame): 13 | print() 14 | 15 | class SignalInterrupt(Exception): 16 | def __init__(self, message): 17 | super(SignalInterrupt, self).__init__(message) 18 | 19 | def signal_stop(signal, frame): 20 | raise SignalInterrupt("Interrupted!") 21 | 22 | bpf_text = """ 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | typedef struct dev_key_s { 30 | u64 dev; 31 | u64 slot; 32 | } dev_key_t; 33 | 34 | BPF_HISTOGRAM(reads, dev_key_t); 35 | BPF_HISTOGRAM(writes, dev_key_t); 36 | BPF_HISTOGRAM(discards, dev_key_t); 37 | 38 | // This sucks, but we have no better solution 39 | static dev_t get_devt(struct request *req) 40 | { 41 | struct gendisk *disk = req->rq_disk; 42 | return disk->part0.__dev.devt; 43 | } 44 | 45 | // time block I/O 
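// The histogram key pairs the device number with a log2 bucket of
// __data_len, so a single run yields one request-size distribution per
// traced device.  Bit 0 of cmd_flags distinguishes writes from reads on
// the kernels this script targets, and REQ_DISCARD routes discards to
// their own table.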
46 | int trace_req_start(struct pt_regs *ctx, struct request *req) 47 | { 48 | dev_t device = get_devt(req); 49 | int major = MAJOR(device); 50 | int minor = MINOR(device); 51 | 52 | if (req->__data_len == 0) 53 | return 0; 54 | 55 | if (!(CONDITIONALS)) 56 | return 0; 57 | 58 | dev_key_t key = { 59 | .dev = device, 60 | .slot = bpf_log2l(req->__data_len), 61 | }; 62 | 63 | if (req->cmd_flags & REQ_DISCARD) 64 | discards.increment(key); 65 | else if ((req->cmd_flags & 1) != 0) 66 | writes.increment(key); 67 | else 68 | reads.increment(key); 69 | return 0; 70 | } 71 | """ 72 | 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument("-d", "--device", 75 | help="Trace this device only") 76 | args = parser.parse_args() 77 | 78 | disks = [] 79 | if args.device: 80 | disks.append({'name': os.path.basename(args.device)}) 81 | else: 82 | dev_patterns = ['sd.*', 'nvme.*', 'nbd.*', 'md.*', "fio*", "etherd*"] 83 | for device in glob.glob("/sys/block/*"): 84 | for pattern in dev_patterns: 85 | if re.compile(pattern).match(os.path.basename(device)): 86 | if pattern == "etherd*": 87 | disks.append({'name': os.path.basename(device).replace('!', '/')}) 88 | else: 89 | disks.append({'name': os.path.basename(device)}) 90 | if debug: 91 | print(disks) 92 | 93 | first = True 94 | conditional_template = "(major == MAJOR && minor == MINOR)" 95 | conditionals = "" 96 | for disk in disks: 97 | stinfo = os.stat('/dev/{}'.format(disk['name'])) 98 | disk['major'] = os.major(stinfo.st_rdev) 99 | disk['minor'] = os.minor(stinfo.st_rdev) 100 | tmp = conditional_template.replace('MAJOR', "{}".format(disk['major'])) 101 | tmp = tmp.replace('MINOR', "{}".format(disk['minor'])) 102 | if not first: 103 | conditionals += " || " 104 | first = False 105 | conditionals += tmp 106 | 107 | if conditionals == "": 108 | conditionals = "1" 109 | bpf_text = bpf_text.replace('CONDITIONALS', conditionals) 110 | 111 | if debug: 112 | print(bpf_text) 113 | 114 | # load BPF program 115 | b = BPF(text=bpf_text) 116 | b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") 117 | b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") 118 | 119 | reads = b.get_table("reads") 120 | writes = b.get_table("writes") 121 | discards= b.get_table("discards") 122 | 123 | print("Tracing, hit Ctrl+C to exit") 124 | signal.signal(signal.SIGINT, signal_stop) 125 | try: 126 | sleep(99999999) 127 | except SignalInterrupt: 128 | signal.signal(signal.SIGINT, signal_ignore) 129 | except KeyboardInterrupt: 130 | signal.signal(signal.SIGINT, signal_ignore) 131 | 132 | def print_device(dev): 133 | MINORBITS = 20 134 | MINORMASK = (1 << MINORBITS) - 1 135 | major = dev >> MINORBITS 136 | minor = dev & MINORMASK 137 | for disk in disks: 138 | if disk['major'] == major and disk['minor'] == minor: 139 | return disk['name'] 140 | return "%d-%d" % (major, minor) 141 | 142 | reads.print_log2_hist("Reads", "dev", section_print_fn=print_device) 143 | writes.print_log2_hist("Writes", "dev", section_print_fn=print_device) 144 | discards.print_log2_hist("Discards", "dev", section_print_fn=print_device) 145 | -------------------------------------------------------------------------------- /btrfs-create-issue: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | _exit() { 4 | echo $1 5 | exit 1 6 | } 7 | 8 | [ "$#" -ne 2 ] && _exit "btrfs-create-usage " 9 | 10 | gh --help > /dev/null 11 | [ $? 
-ne 0 ] && _exit "Please install gh to use this tool (https://cli.github.com/manual/installation)" 12 | 13 | MSG_ID=$1 14 | 15 | TEMPLATE="Link to patches 16 | 17 | https://lore.kernel.org/linux-btrfs/${MSG_ID}/ 18 | 19 | b4 am ${MSG_ID}" 20 | 21 | gh issue create --title "${2}" --project "Btrfs kernel patch review" \ 22 | -R "btrfs/linux" --body "${TEMPLATE}" 23 | -------------------------------------------------------------------------------- /btrfs-csum-leak.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | from time import sleep 3 | 4 | bpf_text = """ 5 | #include 6 | #include 7 | 8 | #define MAX_STACKS 8 9 | struct btrfs_inode; 10 | typedef struct action_s { 11 | u64 inode; 12 | u64 stackid; 13 | u64 num_bytes; 14 | u64 offset; 15 | u64 type; 16 | } action_t; 17 | 18 | typedef struct info_s { 19 | u64 inode; 20 | u64 stackid; 21 | u64 num_bytes; 22 | u64 offset; 23 | } info_t; 24 | 25 | BPF_HASH(infohash, u64, info_t); 26 | BPF_HASH(actions, action_t, u64, 100000); 27 | BPF_HASH(csums, u64, u64); 28 | BPF_STACK_TRACE(stack_traces, 10240); 29 | 30 | 31 | int trace_btrfs_reserve_metadata_bytes(struct pt_regs *ctx, 32 | struct btrfs_inode *inode, 33 | u64 offset, 34 | u64 num_bytes) 35 | { 36 | u64 pid = bpf_get_current_pid_tgid(); 37 | u64 bytes = ALIGN(num_bytes, 4096); 38 | info_t info = { 39 | .inode = (u64)inode, 40 | .stackid = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID), 41 | .num_bytes = bytes, 42 | .offset = offset, 43 | }; 44 | infohash.update(&pid, &info); 45 | return 0; 46 | } 47 | 48 | int trace_btrfs_reserve_metadata_bytes_ret(struct pt_regs *ctx) 49 | { 50 | u64 pid = bpf_get_current_pid_tgid(); 51 | u64 rc = PT_REGS_RC(ctx); 52 | if (rc != 0) 53 | return 0; 54 | info_t *info = infohash.lookup(&pid); 55 | if (!info) 56 | return 0; 57 | action_t action = { 58 | .stackid = info->stackid, 59 | .inode = info->inode, 60 | .num_bytes = info->num_bytes, 61 | .offset = info->offset, 62 | .type = (u64)1, 63 | }; 64 | u64 zero = 0; 65 | u64 bytes = info->num_bytes; 66 | u64 inode = info->inode; 67 | u64 *val = actions.lookup_or_init(&action, &zero); 68 | lock_xadd(val, 1); 69 | u64 *ival = csums.lookup_or_init(&inode, &zero); 70 | lock_xadd(ival, bytes); 71 | infohash.delete(&pid); 72 | return 0; 73 | } 74 | 75 | int trace_btrfs_delalloc_release_metadata(struct pt_regs *ctx, 76 | struct btrfs_inode *inode, 77 | u64 offset, 78 | u64 num_bytes) 79 | { 80 | u64 ino = (u64)inode; 81 | u64 bytes = ALIGN(num_bytes, 4096); 82 | action_t action = { 83 | .stackid = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID), 84 | .inode = ino, 85 | .num_bytes = bytes, 86 | .offset = offset, 87 | .type = (u64)0, 88 | }; 89 | u64 zero = 0; 90 | u64 *val = actions.lookup_or_init(&action, &zero); 91 | lock_xadd(val, 1); 92 | u64 *ival = csums.lookup(&ino); 93 | if (!ival) 94 | return 0; 95 | lock_xadd(ival, -bytes); 96 | return 0; 97 | } 98 | """ 99 | 100 | b = BPF(text=bpf_text) 101 | b.attach_kprobe(event="btrfs_delalloc_reserve_metadata", 102 | fn_name="trace_btrfs_reserve_metadata_bytes") 103 | b.attach_kretprobe(event="btrfs_delalloc_reserve_metadata", 104 | fn_name="trace_btrfs_reserve_metadata_bytes_ret") 105 | b.attach_kprobe(event="btrfs_delalloc_release_metadata", 106 | fn_name="trace_btrfs_delalloc_release_metadata") 107 | 108 | print("Tracing") 109 | try: 110 | sleep(1000000000) 111 | except KeyboardInterrupt: 112 | print("interrupted, dumping info") 113 | 114 | stack_traces = b.get_table("stack_traces") 115 | csums = 
b.get_table("csums") 116 | actions = b.get_table("actions") 117 | 118 | MAX_STACKS = 8 119 | for k,v in csums.items(): 120 | if v.value == 0: 121 | continue 122 | print("inode {} has {} bytes left over".format(k.value, v.value)) 123 | for action,num_entries in actions.items(): 124 | if action.inode != k.value: 125 | continue 126 | if action.type == 1: 127 | print("Get off={} bytes={} times={}".format(action.offset, 128 | action.num_bytes, 129 | num_entries.value)) 130 | else: 131 | print("Put off={} bytes={} times={}".format(action.offset, 132 | action.num_bytes, 133 | num_entries.value)) 134 | stack = stack_traces.walk(action.stackid) 135 | for addr in stack: 136 | print(" {}".format(b.ksym(addr, True, True))) 137 | print("\n") 138 | -------------------------------------------------------------------------------- /btrfs-send-patches: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | _exit() { 4 | echo $1 5 | exit 1 6 | } 7 | 8 | [ "$#" -ne 1 ] && _exit "Must specify a file or directory to send" 9 | 10 | MSG_ID="" 11 | EMAIL="" 12 | 13 | if [ -f $1 ] 14 | then 15 | EMAIL=$1 16 | elif [ -d $1 ] 17 | then 18 | EMAIL=$1/0000-cover-letter.patch 19 | [ -f $EMAIL ] || _exit "You must remember to use --cover-letter" 20 | else 21 | _exit "Must specify a file or directory to send" 22 | fi 23 | 24 | MSG_ID=$(grep 'Message-Id' ${EMAIL}) 25 | [ $? -ne 0 ] && _exit "Message-Id wasn't present in the patch provided" 26 | 27 | MSG_ID=$(echo ${MSG_ID} | \ 28 | python -c 'import re,sys; print(re.match("Message-Id: <(.*)>", sys.stdin.read())[1])') 29 | [ $? -ne 0 ] && _exit "Message-Id couldn't be extracted from the patch provided" 30 | 31 | # Just incase somebody includes 'Subject:' in their commit body 32 | SUBJECT=$(egrep '^Subject:' ${EMAIL} | head -n1 | cut -c 10-) 33 | [ $? -ne 0 ] && _exit "Subject wasn't present in the patch provided" 34 | 35 | git send-email $1 36 | ./btrfs-create-issue "${MSG_ID}" "${SUBJECT}" 37 | -------------------------------------------------------------------------------- /cache-pressure/cache-pressure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkfs.xfs -f /dev/nvme0n1 4 | mount -o noatime /dev/nvme0n1 /mnt/btrfs-test 5 | mkdir /mnt/btrfs-test/0 6 | mkdir /mnt/btrfs-test/1 7 | mkdir /mnt/btrfs-test/reads 8 | 9 | dd if=/dev/zero of=/mnt/btrfs-test/reads/file1 bs=1M count=6500 & 10 | dd if=/dev/zero of=/mnt/btrfs-test/reads/file2 bs=1M count=6500 & 11 | wait 12 | 13 | ./read-file /mnt/btrfs-test/reads/file1 & 14 | PID1=$! 15 | ./read-file /mnt/btrfs-test/reads/file2 & 16 | PID2=$! 
17 | 18 | sleep 5 19 | ./fs_mark -D 5000 -S0 -n 100000 -s 0 -L 20 \ 20 | -d /mnt/btrfs-test/0 -d /mnt/btrfs-test/1 21 | /usr/bin/kill -SIGINT $PID1 $PID2 22 | wait -n $PID1 23 | wait -n $PID2 24 | grep SReclaimable /proc/meminfo 25 | umount /mnt/btrfs-test 26 | -------------------------------------------------------------------------------- /cache-pressure/files-in-cache.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkfs.xfs -f /dev/nvme0n1 4 | mount -o noatime /dev/nvme0n1 /mnt/btrfs-test 5 | mkdir /mnt/btrfs-test/0 6 | mkdir /mnt/btrfs-test/1 7 | mkdir /mnt/btrfs-test/2 8 | mkdir /mnt/btrfs-test/reads 9 | 10 | ./fs_mark -n 1000000 -L 1 -s0 -d /mnt/btrfs-test/0 -d /mnt/btrfs-test/1 \ 11 | -d /mnt/btrfs-test/2 12 | grep xfs_inode /proc/slabinfo 13 | 14 | dd if=/dev/zero of=/mnt/btrfs-test/reads/file1 bs=1M seek=100000 count=1 15 | 16 | ./read-dir /mnt/btrfs-test/0 & 17 | PID1=$! 18 | ./read-dir /mnt/btrfs-test/1 & 19 | PID2=$! 20 | ./read-dir /mnt/btrfs-test/2 & 21 | PID3=$! 22 | python watch-alloc-inode.py $PID1 $PID2 $PID3 & 23 | PID4=$! 24 | 25 | cat /mnt/btrfs-test/reads/file1 > /dev/null 26 | /usr/bin/kill -SIGINT $PID1 $PID2 $PID3 27 | wait -n $PID1 $PID2 $PID3 28 | /usr/bin/kill -SIGINT $PID4 29 | wait -n $PID4 30 | grep xfs_inode /proc/slabinfo 31 | #/usr/bin/kill -SIGINT $BCCPID 32 | #wait 33 | #grep xfs_inode /proc/slabinfo 34 | umount /mnt/btrfs-test 35 | -------------------------------------------------------------------------------- /cache-pressure/read-dir.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | static int done = 0; 13 | 14 | void sig_handler(int signo) 15 | { 16 | done = 1; 17 | } 18 | 19 | int main(int argc, char **argv) 20 | { 21 | char *dirname; 22 | DIR *dir; 23 | char pathbuf[PATH_MAX]; 24 | unsigned long loops = 0; 25 | 26 | if (signal(SIGINT, sig_handler) == SIG_ERR) { 27 | fprintf(stderr, "Couldn't register signal handler\n"); 28 | exit(1); 29 | } 30 | 31 | if (argc != 2) { 32 | fprintf(stderr, "Please specify a file\n"); 33 | exit(1); 34 | } 35 | dirname = strdup(argv[1]); 36 | if (!dirname) { 37 | fprintf(stderr, "Couldn't allocate memory\n"); 38 | exit(1); 39 | } 40 | 41 | dir = opendir(dirname); 42 | if (!dir) { 43 | fprintf(stderr, "Couldn't open dir %s\n", dirname); 44 | exit(1); 45 | } 46 | 47 | while (!done) { 48 | struct dirent *dirent; 49 | struct stat st; 50 | 51 | errno = 0; 52 | dirent = readdir(dir); 53 | if (!dirent && errno == 0) { 54 | rewinddir(dir); 55 | loops++; 56 | continue; 57 | } else if (!dirent) { 58 | fprintf(stderr, "%s: failed to readdir\n", dirname); 59 | exit(1); 60 | } 61 | if (dirent->d_type != DT_REG) 62 | continue; 63 | snprintf(pathbuf, PATH_MAX, "%s/%s", dirname, dirent->d_name); 64 | if (stat(pathbuf, &st)) { 65 | fprintf(stderr, "%s: failed to stat\n", pathbuf); 66 | exit(1); 67 | } 68 | } 69 | printf("%s: total loops %lu\n", dirname, loops); 70 | return 0; 71 | } 72 | -------------------------------------------------------------------------------- /cache-pressure/read-file.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | static int done = 0; 11 | static size_t BUFSIZE = 1024 * 1024; 12 | 13 | void sig_handler(int signo) 14 | { 15 | done = 
1; 16 | } 17 | 18 | static ssize_t full_read(int fd, char *buf, size_t size) 19 | { 20 | ssize_t ret = 0; 21 | 22 | while (ret < size) { 23 | ssize_t cur = read(fd, buf + ret, size - ret); 24 | if (cur <= 0) { 25 | if (!cur) 26 | return ret; 27 | return cur; 28 | } 29 | ret += cur; 30 | } 31 | return ret; 32 | } 33 | 34 | static unsigned long get_read_bytes(int fd, char *buf) 35 | { 36 | ssize_t ret = full_read(fd, buf, BUFSIZE); 37 | unsigned long read_bytes = 123; 38 | int nr; 39 | 40 | if (ret < 0) { 41 | fprintf(stderr, "Failed to read our iofd\n"); 42 | exit(1); 43 | } 44 | buf = strstr(buf, "read_bytes"); 45 | if (!buf) { 46 | fprintf(stderr, "There's no read_bytes entry?\n"); 47 | exit(1); 48 | } 49 | nr = sscanf(buf, "read_bytes: %lu\n", &read_bytes); 50 | if (nr != 1) { 51 | fprintf(stderr, "Couldn't find our read bytes, %d, %lu\n", nr, read_bytes); 52 | exit(1); 53 | } 54 | lseek(fd, 0, SEEK_SET); 55 | return read_bytes; 56 | } 57 | 58 | int main(int argc, char **argv) 59 | { 60 | char *iofile; 61 | char *filename; 62 | char *buf; 63 | int fd, iofd; 64 | unsigned long read_bytes = 0, loops = 0, total_read = 0; 65 | pid_t pid = getpid(); 66 | 67 | if (signal(SIGINT, sig_handler) == SIG_ERR) { 68 | fprintf(stderr, "Couldn't register signal handler\n"); 69 | exit(1); 70 | } 71 | 72 | if (argc != 2) { 73 | fprintf(stderr, "Please specify a file\n"); 74 | exit(1); 75 | } 76 | filename = strdup(argv[1]); 77 | if (!filename) { 78 | fprintf(stderr, "Couldn't allocate memory\n"); 79 | exit(1); 80 | } 81 | 82 | iofile = malloc(sizeof(char) * 64); 83 | if (!iofile) { 84 | fprintf(stderr, "Couldn't allocate a buffer for our iofile\n"); 85 | exit(1); 86 | } 87 | 88 | if (snprintf(iofile, 64, "/proc/%d/io", pid) < 0) { 89 | fprintf(stderr, "Couldn't make our iofile string\n"); 90 | exit(1); 91 | } 92 | 93 | iofd = open(iofile, O_RDONLY); 94 | if (iofd < 0) { 95 | fprintf(stderr, "Couldn't open our io file?\n"); 96 | exit(1); 97 | } 98 | 99 | fd = open(filename, O_RDONLY); 100 | if (fd < 0) { 101 | fprintf(stderr, "Couldn't open file\n"); 102 | exit(1); 103 | } 104 | 105 | buf = malloc(BUFSIZE); 106 | if (!buf) { 107 | fprintf(stderr, "Couldn't allocate my buffer\n"); 108 | exit(1); 109 | } 110 | 111 | read_bytes = get_read_bytes(iofd, buf); 112 | while (!done) { 113 | ssize_t bytes = full_read(fd, buf, BUFSIZE); 114 | if (bytes < 0) { 115 | fprintf(stderr, "Failed to read\n"); 116 | exit(1); 117 | } else if (!bytes) { 118 | unsigned long bytes = get_read_bytes(iofd, buf); 119 | if (bytes != read_bytes) 120 | printf("%s: loop %lu read bytes is %lu\n", 121 | filename, loops, bytes - read_bytes); 122 | total_read += bytes - read_bytes; 123 | read_bytes = bytes; 124 | lseek(fd, 0, SEEK_SET); 125 | loops++; 126 | } 127 | } 128 | printf("%s: total read during loops %lu\n", filename, total_read); 129 | return 0; 130 | } 131 | -------------------------------------------------------------------------------- /cache-pressure/watch-alloc-inode.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | from time import sleep 3 | import signal 4 | import argparse 5 | 6 | def signal_ignore(signal, frame): 7 | print() 8 | 9 | class SignalInterrupt(Exception): 10 | def __init__(self, message): 11 | super(SignalInterrupt, self).__init__(message) 12 | 13 | def signal_stop(signal, frame): 14 | raise SignalInterrupt("Interrupted!") 15 | 16 | bpf_text = """ 17 | #include 18 | #include 19 | 20 | BPF_HASH(alloc_count, u32); 21 | 22 | int trace_alloc_inode(struct 
pt_regs *ctx) 23 | { 24 | u32 pid = (u32)bpf_get_current_pid_tgid(); 25 | u64 zero = 0, *val; 26 | 27 | FILTER_PID 28 | val = alloc_count.lookup_or_init(&pid, &zero); 29 | (*val)++; 30 | return 0; 31 | } 32 | """ 33 | 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument('pids', metavar='PID', type=int, nargs='+', 36 | help='the pids to filter on') 37 | args = parser.parse_args() 38 | 39 | filters = [] 40 | for pid in args.pids: 41 | filters.append("pid != {}".format(pid)) 42 | filter_str = "if ({}) return 0;".format(" && ".join(filters)) 43 | 44 | bpf_text = bpf_text.replace('FILTER_PID', filter_str) 45 | b = BPF(text=bpf_text) 46 | b.attach_kprobe(event="alloc_inode", fn_name="trace_alloc_inode") 47 | 48 | signal.signal(signal.SIGINT, signal_stop) 49 | print("Tracing, hit Ctrl+C to exit") 50 | signal.signal(signal.SIGINT, signal_stop) 51 | try: 52 | sleep(99999999) 53 | except SignalInterrupt: 54 | signal.signal(signal.SIGINT, signal_ignore) 55 | except KeyboardInterrupt: 56 | signal.signal(signal.SIGINT, signal_ignore) 57 | 58 | alloc_count = b.get_table("alloc_count") 59 | count = 0 60 | for k,v in alloc_count.items(): 61 | count += v.value 62 | print("Total of {} inodes were allocated during the run".format(count)) 63 | -------------------------------------------------------------------------------- /check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dmesg | grep -E -q -e "kernel BUG at" \ 4 | -e "WARNING:" \ 5 | -e "\bBUG:" \ 6 | -e "Oops:" \ 7 | -e "possible recursive locking detected" \ 8 | -e "Internal error" \ 9 | -e "(INFO|ERR): suspicious RCU usage" \ 10 | -e "INFO: possible circular locking dependency detected" \ 11 | -e "general protection fault:" \ 12 | -e "BUG .* remaining" \ 13 | -e "UBSAN:" \ 14 | -e "leaked" 15 | 16 | [ "$?" 
-eq 0 ] && exit 1 17 | 18 | umount /mnt/test 19 | btrfs device scan --forget 20 | exit 0 21 | -------------------------------------------------------------------------------- /codepaths.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import sys 4 | import argparse 5 | 6 | class CodeTree: 7 | def __init__(self): 8 | self.root = None 9 | self.remaining = 0 10 | self.processed = 0 11 | self.avg = 0.0 12 | self.total = 0.0 13 | self.discovered = [] 14 | 15 | def contains(self, name): 16 | if name in self.discovered: 17 | return True 18 | self.discovered.append(name) 19 | return False 20 | 21 | def _print_path(self, node, path): 22 | path += " " + node.name 23 | ret = "" 24 | if len(node.children) == 0: 25 | return path + "\n" 26 | for i in node.children: 27 | ret += self._print_path(i, path) 28 | return ret 29 | 30 | def __str__(self): 31 | if self.root is None: 32 | return "" 33 | return self._print_path(self.root, "") 34 | 35 | def _print_leaves(self, node): 36 | ret = "" 37 | if len(node.children) == 0: 38 | return node.name + "\n" 39 | for i in node.children: 40 | ret += self._print_leaves(i) 41 | return ret 42 | 43 | def _find_all_paths(self, node, path): 44 | path = path + [node.name] 45 | if len(node.children) == 0: 46 | return [path] 47 | paths = [] 48 | for n in node.children: 49 | newpaths = self._find_all_paths(n, path) 50 | for newpath in newpaths: 51 | paths.append(newpath) 52 | return paths 53 | 54 | def paths(self): 55 | if self.root is None: 56 | return [] 57 | return self._find_all_paths(self.root, []) 58 | 59 | def leaves(self): 60 | if self.root is None: 61 | return "" 62 | return self._print_leaves(self.root) 63 | 64 | class CodeNode: 65 | def __init__(self, name): 66 | self.children = [] 67 | self.name = name 68 | 69 | def add_child(self, child): 70 | self.children.append(child) 71 | 72 | def find_callers(func, cscopedb): 73 | p = subprocess.Popen(["cscope", "-d", "-f", cscopedb, "-L3", func], 74 | stdout=subprocess.PIPE) 75 | (output, error) = p.communicate() 76 | output = output.rstrip() 77 | ret = [] 78 | for l in output.split('\n'): 79 | ret.append(l.split(' ')[:2]) 80 | return ret 81 | 82 | def get_paths(tree, node, cscopedb, directories, exclude, dupes): 83 | tree.processed += 1 84 | t0 = time.time() 85 | callers = find_callers(node.name, cscopedb) 86 | tree.total += time.time() - t0 87 | tree.processed += 1 88 | tree.remaining -= 1 89 | tree.remaining += len(callers) 90 | avg = tree.total / tree.processed 91 | remain = tree.remaining * (tree.total / tree.processed) 92 | sys.stderr.write("\r{} elapsed, {} possible remaining".format(tree.total, remain)) 93 | sys.stderr.flush() 94 | for c in callers: 95 | skip = True 96 | for i in directories: 97 | if i in c[0]: 98 | skip = False 99 | break 100 | if skip: 101 | tree.remaining -= 1 102 | continue 103 | 104 | for i in exclude: 105 | if i in c[0]: 106 | skip = True 107 | break 108 | if skip: 109 | tree.remaining -= 1 110 | continue 111 | 112 | if not dupes and tree.contains(c[1]): 113 | tree.remaining -= 1 114 | continue 115 | 116 | child = CodeNode(c[1]) 117 | node.add_child(child) 118 | tree.processed += 1 119 | get_paths(tree, child, cscopedb, directories, exclude, dupes) 120 | 121 | parser = argparse.ArgumentParser() 122 | parser.add_argument("-c", "--cscopedb", default="cscope.out", help="Location of cscope.out") 123 | parser.add_argument("-e", "--exclude", action='append', 124 | help="Exclude this component of the path") 125 | 
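# Hypothetical example run: list the outermost caller of every unique
# call chain that can reach btrfs_commit_transaction, restricted to
# fs/btrfs, against a database built beforehand with `cscope -b -R` in
# the kernel tree (add -t to print the full chains instead):
#   python codepaths.py -c cscope.out -d fs/btrfs btrfs_commit_transaction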
parser.add_argument("-d", "--directory", action='append', 126 | help="Only deal with functions in this directory (can just be one part of the path)") 127 | parser.add_argument("-p", "--duplicates", action='store_true', 128 | help="Don't filter out duplicate leaves (ie have a->b->c as well as a->d->c)") 129 | parser.add_argument("-t", "--tree", action='store_true', 130 | help="Print all of the paths of the whole tree") 131 | parser.add_argument("function", help="Function to build the code paths from") 132 | 133 | args = parser.parse_args() 134 | 135 | exclude = [] 136 | directories = [] 137 | 138 | if args.directory is not None: 139 | directories = args.directory 140 | if args.exclude is not None: 141 | exclude = args.exclude 142 | 143 | tree = CodeTree() 144 | node = CodeNode(args.function) 145 | tree.root = node 146 | get_paths(tree, node, args.cscopedb, directories, exclude, args.duplicates) 147 | 148 | sys.stderr.write("\nProccessed {} functions in {} seconds\n".format(tree.processed, tree.total)) 149 | sys.stderr.flush() 150 | 151 | leaves = tree.paths() 152 | lsorted = sorted(leaves, key=lambda x:len(x)) 153 | 154 | if args.tree: 155 | print(tree) 156 | else: 157 | for i in lsorted: 158 | print(i[-1]) 159 | -------------------------------------------------------------------------------- /deadlock.py: -------------------------------------------------------------------------------- 1 | for i in range(0, len(prog['page_wait_table'])): 2 | wait_t = prog['page_wait_table'][i] 3 | if not list_empty(wait_t.head.address_of_()): 4 | for entry in list_for_each_entry('wait_queue_entry_t', wait_t.head.address_of_(), 'entry'): 5 | page_entry = container_of(entry, 'struct wait_page_queue', 'wait') 6 | task = cast("struct task_struct *", entry.private) 7 | print("page {} mapping {} index {} bit {} pid {} flags {}".format( 8 | hex(page_entry.page.value_()), 9 | hex(page_entry.page.mapping.value_()), 10 | page_entry.page.index, page_entry.bit_nr, task.pid, 11 | page_bits(page_entry.page))) 12 | # find_compressed_bio(page_entry.page.mapping.host) 13 | for cb in cb_objs: 14 | if cb.inode == page_entry.page.mapping.host: 15 | if cb.start == page_entry.page.index << 12: 16 | print(cb) 17 | find_bio_page(cb.compressed_pages[0]) 18 | find_inode_mapping(page_entry.page.mapping.host, page_entry.page.index << 12) 19 | find_inode_ordered_extent(page_entry.page.mapping.host, page_entry.page.index << 12) 20 | 21 | def find_bio_page(page): 22 | for b in objs: 23 | if b.bio.bi_vcnt < 0 or b.bio.bi_vcnt > 100: 24 | continue 25 | for bvec in bio_for_each_bvec(prog, b.bio): 26 | try: 27 | if bvec.bv_page == page: 28 | return b 29 | except FaultError: 30 | break 31 | 32 | def find_bio_private(value): 33 | for b in objs: 34 | if b.bio.bi_private.value_() == value: 35 | print(b) 36 | 37 | def find_bio_sector(value): 38 | for b in objs: 39 | if b.bio.bi_iter.bi_sector.value_() == value: 40 | print(b) 41 | 42 | def find_compressed_bio(inode): 43 | for b in objs: 44 | if b.bio.bi_vcnt < 0 or b.bio.bi_vcnt > 100: 45 | continue 46 | if b.bio.bi_end_io.value_() != prog['end_compressed_bio_write'].address_of_().value_(): 47 | continue 48 | try: 49 | cb = cast("struct compressed_bio *", b.bio.bi_private) 50 | if cb.inode == inode: 51 | print(cb) 52 | print(b) 53 | except FaultError: 54 | break 55 | 56 | def find_inode_ordered_extent(inode, offset): 57 | btrfs_inode = container_of(inode, 'struct btrfs_inode', 'vfs_inode') 58 | for ordered in rbtree_inorder_for_each_entry('struct btrfs_ordered_extent', 59 | 
btrfs_inode.ordered_tree.tree, 60 | 'rb_node'): 61 | if ordered.file_offset <= offset and ordered.file_offset + ordered.num_bytes > offset: 62 | print(ordered) 63 | 64 | 65 | def find_inode_mapping(inode, offset): 66 | btrfs_inode = container_of(inode, 'struct btrfs_inode', 'vfs_inode') 67 | for em in rbtree_inorder_for_each_entry('struct extent_map', 68 | btrfs_inode.extent_tree.map.rb_root, 69 | 'rb_node'): 70 | if em.start <= offset and em.start + em.len > offset: 71 | print(f'{em.start} {em.block_start} {em.block_len}') 72 | 73 | def dump_inode_ordered_extents(inode): 74 | btrfs_inode = container_of(inode, 'struct btrfs_inode', 'vfs_inode') 75 | for ordered in rbtree_inorder_for_each_entry('struct btrfs_ordered_extent', 76 | btrfs_inode.ordered_tree.tree, 77 | 'rb_node'): 78 | print(ordered) 79 | 80 | def dump_inode_extent_map(inode): 81 | btrfs_inode = container_of(inode, 'struct btrfs_inode', 'vfs_inode') 82 | for em in rbtree_inorder_for_each_entry('struct extent_map', 83 | btrfs_inode.extent_tree.map.rb_root, 84 | 'rb_node'): 85 | print(f'{em.start} {em.block_start} {em.block_len}') 86 | 87 | for b in objs: 88 | if b.bio.bi_vcnt < 0 or b.bio.bi_vcnt > 300: 89 | continue 90 | if b.bio.bi_end_io.value_() != prog['end_compressed_bio_write'].address_of_().value_(): 91 | continue 92 | try: 93 | cb = cast("struct compressed_bio *", b.bio.bi_private) 94 | print(cb.start) 95 | except FaultError: 96 | break 97 | 98 | our_dip = None 99 | for d in dips: 100 | for bio in dio_bios: 101 | if bio.address_of_().value_() == d.dio_bio.value_(): 102 | our_dip = d 103 | break 104 | 105 | def find_wqs(wqs, endio): 106 | ret = [] 107 | for w in wqs: 108 | if w.end_io.value_() == prog[endio].address_of_().value_(): 109 | ret.append(w) 110 | return ret 111 | 112 | def find_btrfs_bios(bios, endio): 113 | ret = [] 114 | for b in bios: 115 | if b.bio.bi_vcnt < 0 or b.bio.bi_vcnt > 300: 116 | continue 117 | if b.bio.bi_end_io.value_() != prog[endio].address_of_().value_(): 118 | if b.bio.bi_end_io.value_() != prog['btrfs_end_bio'].address_of_().value_(): 119 | print(b.bio.bi_end_io) 120 | continue 121 | bbio = cast('struct btrfs_bio *', b.bio.bi_private) 122 | if bbio.end_io.value_() != prog[endio].address_of_().value_(): 123 | if bbio.end_io.value_() != prog['end_workqueue_bio'].address_of_().value_(): 124 | # print(bbio.end_io) 125 | continue 126 | end_io_wq = cast('struct btrfs_end_io_wq', bbio.private) 127 | if end_io_wq.end_io.value_() != prog[endio].address_of_().value_(): 128 | print(end_io_wq.end_io) 129 | continue 130 | print(bbio.end_io) 131 | continue 132 | print("HOOOORAAAYYY") 133 | ret.append(b) 134 | return ret 135 | 136 | objs = dump_slab_objects(prog, prog['btrfs_bioset'].bio_slab, 'struct btrfs_io_bio') 137 | 138 | def find_dio_bios(bios): 139 | ret = [] 140 | for b in bios: 141 | if b.bi_vcnt < 0 or b.bi_vcnt > 300: 142 | continue 143 | if b.bi_end_io.value_() != prog['dio_bio_end_io'].address_of_().value_(): 144 | continue 145 | ret.append(b) 146 | return ret 147 | 148 | bios = dump_slab_objects(prog, prog['fs_bio_set'].bio_slab, 'struct bio') 149 | 150 | for i in range(0, len(prog['page_wait_table'])): 151 | wait_t = prog['page_wait_table'][i] 152 | if not list_empty(wait_t.head.address_of_()): 153 | for entry in list_for_each_entry('wait_queue_entry_t', wait_t.head.address_of_(), 'entry'): 154 | page_entry = container_of(entry, 'struct wait_page_queue', 'wait') 155 | task = cast("struct task_struct *", entry.private) 156 | bio = None 157 | for b in objs: 158 | if b.bio.bi_vcnt < 0 or 
b.bio.bi_vcnt > 100: 159 | continue 160 | for bvec in bio_for_each_bvec(prog, b.bio): 161 | try: 162 | if bvec.bv_page == page_entry.page: 163 | bio = b 164 | break 165 | except FaultError: 166 | break 167 | if bio: 168 | break 169 | print("page {} mapping {} index {} bit {} pid {} flags {}".format( 170 | hex(page_entry.page.value_()), 171 | hex(page_entry.page.mapping.value_()), 172 | page_entry.page.index, page_entry.bit_nr, task.pid, 173 | page_bits(page_entry.page))) 174 | print(bio) 175 | 176 | def page_bits(page): 177 | ret = "" 178 | for name,value in prog.type('enum pageflags').enumerators: 179 | bit = 1 << value 180 | if (bit & page.flags): 181 | if ret == "": 182 | ret += name 183 | else: 184 | ret += "|{}".format(name) 185 | return ret 186 | 187 | def dump_locked_page_waiters(prog): 188 | for i in range(0, len(prog['page_wait_table'])): 189 | wait_t = prog['page_wait_table'][i] 190 | if not list_empty(wait_t.head.address_of_()): 191 | for entry in list_for_each_entry('wait_queue_entry_t', wait_t.head.address_of_(), 'entry'): 192 | page_entry = container_of(entry, 'struct wait_page_queue', 'wait') 193 | task = cast("struct task_struct *", entry.private) 194 | print("page {} mapping {} index {} bit {} pid {} flags {}".format( 195 | hex(page_entry.page.value_()), 196 | hex(page_entry.page.mapping.value_()), 197 | page_entry.page.index, page_entry.bit_nr, task.pid, 198 | page_bits(page_entry.page))) 199 | 200 | for t in for_each_task(prog): 201 | if t.state.value_() == 2: 202 | trace = prog.stack_trace(t) 203 | if len(trace) >= 3: 204 | if (trace[0].symbol().name == "__schedule" and 205 | "rwsem_down" in trace[2].symbol().name): 206 | continue 207 | if len(trace) > 4: 208 | if ("__mutex_lock" in trace[3].symbol().name and 209 | trace[4].symbol().name == "btrfs_start_delalloc_roots"): 210 | continue 211 | print("task {} is stuck".format(t.pid)) 212 | prog.stack_trace(t) 213 | 214 | for t in for_each_task(prog): 215 | if t.state.value_() == 2: 216 | trace = prog.stack_trace(t) 217 | if len(trace) >= 3: 218 | if (trace[0].symbol().name == "__schedule" and 219 | "rwsem_down" in trace[2].symbol().name): 220 | continue 221 | if len(trace) > 4: 222 | if ("mutex_lock" in trace[3].symbol().name and 223 | trace[4].symbol().name == "btrfs_start_delalloc_roots"): 224 | continue 225 | print("task {} is stuck".format(t.pid)) 226 | prog.stack_trace(t) 227 | print("") 228 | 229 | def btrfs_for_each_root(fs_info): 230 | for objectid,root_ptr in radix_tree_for_each(fs_info.fs_roots_radix.address_of_()): 231 | root = cast('struct btrfs_root *', root_ptr) 232 | yield root 233 | 234 | for root in btrfs_for_each_root(fs_info): 235 | flag = 1 << prog['BTRFS_ROOT_DEAD_RELOC_TREE'].value_() 236 | if root.state & flag: 237 | print("root {} has it set".format(root.root_key.objectid)) 238 | 239 | def btrfs_dump_live_inodes(root): 240 | for inode in rbtree_inorder_for_each_entry('struct btrfs_inode', 241 | root.inode_tree, 'rb_node'): 242 | print(f"{inode.location.objectid} {inode.vfs_inode.i_count.counter} {inode.vfs_inode.i_state}") 243 | 244 | def btrfs_get_root(fs_info, root_id): 245 | return cast('struct btrfs_root *', 246 | radix_tree_lookup(fs_info.fs_roots_radix.address_of_(), root_id)) 247 | 248 | def btrfs_get_fs_info(prog, path): 249 | mnt = None 250 | for m in for_each_mount(prog): 251 | if (mount_dst(m).decode('ascii') == path and 252 | mount_fstype(m).decode('ascii') == "btrfs"): 253 | mnt = m 254 | break 255 | if not mnt: 256 | return None 257 | return cast("struct btrfs_fs_info *", 
mnt.mnt.mnt_sb.s_fs_info) 258 | 259 | def btrfs_dump_fs_infos(prog): 260 | for m in for_each_mount(prog): 261 | if mount_fstype(m) == b'btrfs': 262 | print("{} {}".format(mount_dst(m), 263 | hex(m.mnt.mnt_sb.s_fs_info.value_()))) 264 | 265 | for b in objs: 266 | if b.fs_info == fs_info: 267 | print(b) 268 | -------------------------------------------------------------------------------- /dio-io-sizes.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from bcc import BPF 3 | import ctypes as ct 4 | 5 | bpf_text = """ 6 | #include 7 | #include 8 | #include 9 | 10 | typedef struct block_data_s { 11 | struct buffer_head *map_bh; 12 | u64 b_orig_size; 13 | u64 b_found_size; 14 | u64 b_state; 15 | } block_data_t; 16 | 17 | typedef struct read_data_s { 18 | u64 count; 19 | u64 b_orig_size; 20 | u64 b_found_size; 21 | u64 b_state; 22 | } read_data_t; 23 | 24 | typedef struct event_data_s { 25 | u64 pid; 26 | u64 time; 27 | char op[16]; 28 | } event_data_t; 29 | 30 | BPF_HASH(buffers, u64, block_data_t); 31 | BPF_HASH(reads, u64, read_data_t); 32 | BPF_HASH(read_traces, u64); 33 | BPF_STACK_TRACE(stack_traces, 1024); 34 | BPF_PERF_OUTPUT(read_events); 35 | BPF_PERF_OUTPUT(block_events); 36 | BPF_PERF_OUTPUT(events); 37 | 38 | int trace_get_blocks(struct pt_regs *ctx, struct inode *inode, 39 | sector_t block, struct buffer_head *map_bh, 40 | int create) 41 | { 42 | if (create) 43 | return 0; 44 | u64 pid = bpf_get_current_pid_tgid(); 45 | buffers.delete(&pid); 46 | 47 | block_data_t key = { 48 | .map_bh = map_bh, 49 | .b_orig_size = map_bh->b_size, 50 | }; 51 | buffers.update(&pid, &key); 52 | 53 | pid = bpf_get_current_pid_tgid(); 54 | read_data_t *data = reads.lookup(&pid); 55 | if (!data) { 56 | event_data_t edata = { 57 | .op = "get_blocks_miss", 58 | .pid = pid, 59 | .time = bpf_ktime_get_ns(), 60 | }; 61 | events.perf_submit(ctx, &edata, sizeof(edata)); 62 | return 0; 63 | } 64 | /* 65 | event_data_t edata = { 66 | .op = "get_blocks_hit", 67 | .pid = pid, 68 | .time = bpf_ktime_get_ns(), 69 | }; 70 | events.perf_submit(ctx, &edata, sizeof(edata)); 71 | */ 72 | if (data->count == map_bh->b_size) 73 | return 0; 74 | 75 | read_data_t out = { 76 | .count = data->count, 77 | .b_orig_size = map_bh->b_size, 78 | .b_found_size = 0, 79 | }; 80 | block_events.perf_submit(ctx, &out, sizeof(out)); 81 | 82 | return 0; 83 | } 84 | 85 | int trace_exit_get_blocks(struct pt_regs *ctx) 86 | { 87 | u64 pid = bpf_get_current_pid_tgid(); 88 | block_data_t *data; 89 | 90 | data = buffers.lookup(&pid); 91 | if (!data) 92 | return 0; 93 | 94 | u64 size,state; 95 | 96 | // the rewriter doesn't recognize this as needing a probe read, so do 97 | // it ourselves 98 | bpf_probe_read(&size, sizeof(u64), &data->map_bh->b_size); 99 | bpf_probe_read(&state, sizeof(u64), &data->map_bh->b_state); 100 | 101 | data->b_found_size = size; 102 | data->b_state = state; 103 | 104 | if (data->b_found_size != data->b_orig_size) { 105 | read_data_t out = { 106 | .count = 0, 107 | .b_orig_size = data->b_orig_size, 108 | .b_found_size = data->b_found_size, 109 | }; 110 | block_events.perf_submit(ctx, &out, sizeof(out)); 111 | } 112 | return 0; 113 | } 114 | 115 | int trace_vfs_read(struct pt_regs *ctx, struct file *file, char *buf, size_t count) 116 | { 117 | u64 magic = file->f_mapping->host->i_sb->s_magic; 118 | if (magic != 0x58465342) 119 | return 0; 120 | read_data_t data = { 121 | .count = count, 122 | }; 123 | u64 pid = bpf_get_current_pid_tgid(); 124 | 
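/*
 * Key the in-flight read by the full pid_tgid value so concurrent reads
 * from different threads of one process can't clobber each other's
 * entry; the vfs_read kretprobe and the submit_bio probe below look the
 * slot up by the same key.
 */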
reads.update(&pid, &data); 125 | /* 126 | event_data_t edata = { 127 | .op = "read", 128 | .pid = pid, 129 | .time = bpf_ktime_get_ns(), 130 | }; 131 | events.perf_submit(ctx, &edata, sizeof(edata)); 132 | */ 133 | return 0; 134 | } 135 | 136 | int trace_vfs_read_ret(struct pt_regs *ctx) 137 | { 138 | u64 pid = bpf_get_current_pid_tgid(); 139 | read_data_t *data = reads.lookup(&pid); 140 | if (!data) 141 | return 0; 142 | reads.delete(&pid); 143 | /* 144 | event_data_t edata = { 145 | .op = "read exit", 146 | .pid = pid, 147 | .time = bpf_ktime_get_ns(), 148 | }; 149 | events.perf_submit(ctx, &edata, sizeof(edata)); 150 | */ 151 | return 0; 152 | } 153 | 154 | int trace_submit_bio(struct pt_regs *ctx, int rw, struct bio *bio) 155 | { 156 | if ((rw & 1) == 1) 157 | return 0; 158 | if (bio->bi_iter.bi_size != 4096) 159 | return 0; 160 | u64 pid = bpf_get_current_pid_tgid(); 161 | read_data_t *data = reads.lookup(&pid); 162 | if (!data) 163 | return 0; 164 | block_data_t *bdata = buffers.lookup(&pid); 165 | if (!bdata) 166 | return 0; 167 | u64 stackid = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID); 168 | data->b_orig_size = bdata->b_orig_size; 169 | data->b_found_size = bdata->b_found_size; 170 | data->b_state = bdata->b_state; 171 | read_traces.update(&pid, &stackid); 172 | read_events.perf_submit(ctx, data, sizeof(read_data_t)); 173 | return 0; 174 | } 175 | """ 176 | 177 | b = BPF(text=bpf_text) 178 | b.attach_kprobe(event="xfs_get_blocks_direct", fn_name="trace_get_blocks") 179 | b.attach_kretprobe(event="xfs_get_blocks_direct", fn_name="trace_exit_get_blocks") 180 | b.attach_kprobe(event="vfs_read", fn_name="trace_vfs_read") 181 | b.attach_kretprobe(event="vfs_read", fn_name="trace_vfs_read_ret") 182 | b.attach_kprobe(event="submit_bio", fn_name="trace_submit_bio") 183 | 184 | class ReadData(ct.Structure): 185 | _fields_ = [ 186 | ("count", ct.c_ulonglong), 187 | ("b_orig_size", ct.c_ulonglong), 188 | ("b_found_size", ct.c_ulonglong), 189 | ] 190 | 191 | class EventData(ct.Structure): 192 | _fields_ = [ 193 | ("pid", ct.c_ulonglong), 194 | ("time", ct.c_ulonglong), 195 | ("op", ct.c_char * 16), 196 | ] 197 | def print_data(cpu, data, size): 198 | event = ct.cast(data, ct.POINTER(ReadData)).contents 199 | print("wrong bio size for read %s, map wanted size %s, map found size %s" % (event.count, event.b_orig_size, event.b_found_size)) 200 | 201 | def print_block(cpu, data, size): 202 | event = ct.cast(data, ct.POINTER(ReadData)).contents 203 | print("wrong map size for read %s, map wanted size %s, map found size %s" % (event.count, event.b_orig_size, event.b_found_size)) 204 | 205 | def print_events(cpu, data, size): 206 | event = ct.cast(data, ct.POINTER(EventData)).contents 207 | print("%s op %s pid %s" % (event.time, event.op, event.pid)) 208 | 209 | b["read_events"].open_perf_buffer(print_data) 210 | b["block_events"].open_perf_buffer(print_block) 211 | b["events"].open_perf_buffer(print_events) 212 | read_traces = b.get_table("read_traces") 213 | stack_traces = b.get_table("stack_traces") 214 | 215 | while 1: 216 | b.kprobe_poll() 217 | # for k,v in read_traces.items(): 218 | # stack = stack_traces.walk(v.value) 219 | # print("Pid %d" % (k.value)) 220 | # for addr in stack: 221 | # print(" %s" % b.ksym(addr)) 222 | # print("\n") 223 | read_traces.clear() 224 | -------------------------------------------------------------------------------- /eio-stress.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | from time import 
sleep 3 | from subprocess import Popen 4 | import argparse 5 | import sys 6 | import os 7 | import ctypes as ct 8 | 9 | bpf_text = """ 10 | #include 11 | #include 12 | #include 13 | 14 | BPF_HASH(seen, u64); 15 | BPF_ARRAY(enabled, u64, 1); 16 | BPF_PERF_OUTPUT(events); 17 | BPF_STACK_TRACE(stack_traces, 10240); 18 | 19 | int override_function(struct pt_regs *ctx, struct bio *bio) 20 | { 21 | unsigned long rc = RCVAL; 22 | 23 | if (bio->bi_bdev->bd_disk->major != MAJOR || bio->bi_bdev->bd_disk->first_minor != MINOR) 24 | return 0; 25 | 26 | /* Make sure we're ready to inject errors. */ 27 | int index = 0; 28 | u64 *e = enabled.lookup(&index); 29 | if (!e || *e == 0) 30 | return 0; 31 | if (*e > 1) 32 | goto fail; 33 | 34 | /* Have we seen this stacktrace yet? */ 35 | u64 key = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID); 36 | u64 zero = 0; 37 | u64 *val = seen.lookup_or_init(&key, &zero); 38 | if (*val == 1) 39 | return 0; 40 | lock_xadd(val, 1); 41 | lock_xadd(e, 1); 42 | 43 | events.perf_submit(ctx, &key, sizeof(key)); 44 | bpf_trace_printk("overriding something\\n"); 45 | fail: 46 | bpf_override_return(ctx, rc); 47 | return 0; 48 | } 49 | """ 50 | 51 | error_tripped = 0 52 | 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("-o", "--override", required=True, 55 | help="The function to override") 56 | parser.add_argument("-r", "--retval", type=str, help="The return value to use") 57 | parser.add_argument("-e", "--executable", type=str, required=True, 58 | help="The command to run") 59 | parser.add_argument("-d", "--device", type=str, required=True, 60 | help="The device to error on") 61 | 62 | args = parser.parse_args() 63 | retval = "NULL" 64 | 65 | if args.retval is not None: 66 | retval = args.retval 67 | 68 | dev_path = os.path.realpath(args.device) 69 | dev_st = os.stat(dev_path) 70 | bpf_text = bpf_text.replace("MAJOR", str(os.major(dev_st.st_rdev))) 71 | bpf_text = bpf_text.replace("MINOR", str(os.minor(dev_st.st_rdev))) 72 | 73 | bpf_text = bpf_text.replace("RCVAL", retval) 74 | 75 | print("Loading error injection") 76 | b = BPF(text=bpf_text) 77 | 78 | # Load the kretprobe first, because we want the delete guy to be in place before 79 | # the add guy is in place, otherwise we could error out pids that are no longer 80 | # in our path and cause unfortunate things to happen. 
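# Note: bpf_override_return() only works on kernels built with
# CONFIG_BPF_KPROBE_OVERRIDE, and only on functions the kernel has tagged
# with ALLOW_ERROR_INJECTION, so --override must name one of those (see
# /sys/kernel/debug/error_injection/list on the running kernel).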
81 | b.attach_kprobe(event=args.override, fn_name="override_function") 82 | p = None 83 | 84 | def handle_error(cpu, data, size): 85 | stackid = ct.cast(data, ct.POINTER(ct.c_ulonglong)).contents 86 | stack_traces = b.get_table("stack_traces") 87 | stack = stack_traces.walk(stackid.value) 88 | print("Injected error here") 89 | for addr in stack: 90 | print(" %s" % b.ksym(addr)) 91 | globals()['error_tripped'] = 1 92 | 93 | b["events"].open_perf_buffer(handle_error) 94 | 95 | missed_errors = 0 96 | 97 | while 1: 98 | print("Running command") 99 | error_tripped = 0 100 | t = b.get_table("enabled") 101 | t[0] = ct.c_int(1) 102 | 103 | p = Popen(args.executable) 104 | 105 | while error_tripped == 0: 106 | b.kprobe_poll(timeout=10) 107 | if p.poll() is not None: 108 | print("The command exited, breaking") 109 | break 110 | 111 | print("Waiting for the command to exit") 112 | p.wait() 113 | 114 | p = Popen(["umount", "/mnt/test"]) 115 | p.wait() 116 | 117 | if error_tripped == 0: 118 | missed_errors += 1 119 | print("Didn't hit anything, trying again") 120 | if missed_errors == 3: 121 | print("Error injection didn't trip anything, exiting") 122 | break 123 | else: 124 | missed_errors = 0 125 | 126 | t[0] = ct.c_int(0) 127 | 128 | p = Popen("./check.sh") 129 | if p.wait() == 1: 130 | print("Things went wrong, breaking") 131 | break 132 | 133 | # We have to remove in this order otherwise we could end up with a half 134 | # populated hashmap and overriding legitimate things. 135 | b.detach_kprobe(args.override) 136 | print("Exiting") 137 | 138 | -------------------------------------------------------------------------------- /enospc-stress/Makefile: -------------------------------------------------------------------------------- 1 | CLANG ?= clang 2 | LLVM_STRIP ?= llvm-strip 3 | BPFTOOL ?= bpftool 4 | ARCH := $(shell uname -m | sed 's/x86_64/x86/') 5 | CFLAGS := -Wall -Werror -D_GNU_SOURCE 6 | LIBS := -lpthread -lelf -lz 7 | LIBBPF_OBJ := /usr/lib64/libbpf.a 8 | 9 | all: enospc-stress 10 | 11 | # Generate vmlinux.h 12 | vmlinux.h: 13 | $(BPFTOOL) btf dump file /sys/kernel/btf/vmlinux format c > $@ 14 | 15 | # Build the bpf code itself 16 | %.bpf.o: vmlinux.h %.bpf.c 17 | $(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) -c $(filter %.c,$^) -o $@ 18 | $(LLVM_STRIP) -g $@ 19 | 20 | # Generate the skel file 21 | %.skel.h: %.bpf.o 22 | $(BPFTOOL) gen skel $< > $@ 23 | 24 | # This doesn't work and I don't fucking know why 25 | #%.o: %.c %.skel.h common.h 26 | # $(CC) $(CFLAGS) -c enospc-stress.c -o $@ 27 | 28 | #%: %.o 29 | # $(CC) $(CFLAGS) $^ $(LIBBPF_OBJ) $(LIBS) -o $@ 30 | 31 | enospc-stress.o: enospc-stress.c enospc-stress.skel.h common.h 32 | $(CC) $(CFLAGS) -c enospc-stress.c -o $@ 33 | 34 | enospc-stress: enospc-stress.o 35 | $(CC) $(CFLAGS) $^ $(LIBBPF_OBJ) $(LIBS) -o $@ 36 | -------------------------------------------------------------------------------- /enospc-stress/common.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_H 2 | #define __COMMON_H 3 | 4 | #ifndef u64 5 | #define u64 uint64_t 6 | #endif 7 | 8 | struct event { 9 | u64 flags; 10 | u64 total_bytes; 11 | u64 bytes_used; 12 | u64 bytes_pinned; 13 | u64 bytes_may_use; 14 | u64 bytes_reserved; 15 | u64 bytes_readonly; 16 | u64 global_rsv; 17 | u64 trans_rsv; 18 | u64 delayed_refs_rsv; 19 | u64 delayed_rsv; 20 | }; 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /enospc-stress/enospc-stress.bpf.c: 
-------------------------------------------------------------------------------- 1 | #include "vmlinux.h" 2 | #include 3 | #include 4 | #include 5 | #include "common.h" 6 | 7 | char LICENSE[] SEC("license") = "Dual BSD/GPL"; 8 | 9 | struct { 10 | __uint(type, BPF_MAP_TYPE_RINGBUF); 11 | __uint(max_entries, 256 * 1024); 12 | } rb SEC(".maps"); 13 | 14 | struct { 15 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 16 | __uint(max_entries, 1); 17 | __type(key, 1); 18 | __type(value, struct event); 19 | } heap SEC(".maps"); 20 | 21 | SEC("tp/btrfs/btrfs_fail_all_tickets") 22 | int handle_fail_all_tickets(struct trace_event_raw_btrfs_dump_space_info *ctx) 23 | { 24 | struct event *e; 25 | int zero = 0; 26 | 27 | e = bpf_map_lookup_elem(&heap, &zero); 28 | if (!e) 29 | return 0; 30 | 31 | e->flags = ctx->flags; 32 | e->total_bytes = ctx->total_bytes; 33 | e->bytes_used = ctx->bytes_used; 34 | e->bytes_pinned = ctx->bytes_pinned; 35 | e->bytes_may_use = ctx->bytes_may_use; 36 | e->bytes_reserved = ctx->bytes_reserved; 37 | e->bytes_readonly = ctx->bytes_readonly; 38 | e->global_rsv = ctx->global_reserved; 39 | e->trans_rsv = ctx->trans_reserved; 40 | e->delayed_refs_rsv = ctx->delayed_refs_reserved; 41 | e->delayed_rsv = ctx->delayed_reserved; 42 | 43 | bpf_ringbuf_output(&rb, e, sizeof(*e), 0); 44 | return 0; 45 | } 46 | -------------------------------------------------------------------------------- /enospc-stress/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkfs.btrfs -f /dev/nvme0n1 -b 100g 4 | mount /dev/nvme0n1 /mnt/scratch 5 | ./enospc-stress -r 300 /mnt/scratch 6 | btrfs fi usage /mnt/scratch 7 | umount /mnt/scratch 8 | -------------------------------------------------------------------------------- /error-injection-stress.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | from time import sleep 3 | from subprocess import Popen 4 | import argparse 5 | import sys 6 | import os 7 | import ctypes as ct 8 | 9 | bpf_text = """ 10 | #include 11 | #include 12 | 13 | BPF_CGROUP_ARRAY(cgroup, 1); 14 | BPF_HASH(seen, u64); 15 | BPF_ARRAY(enabled, u64, 1); 16 | BPF_PERF_OUTPUT(events); 17 | BPF_STACK_TRACE(stack_traces, 10240); 18 | 19 | int override_function(struct pt_regs *ctx) 20 | { 21 | /* Filter on our cgroup. */ 22 | if (cgroup.check_current_task(0) <= 0) 23 | return 0; 24 | 25 | /* Make sure we're ready to inject errors. */ 26 | int index = 0; 27 | u64 *e = enabled.lookup(&index); 28 | if (!e || *e == 0) 29 | return 0; 30 | 31 | /* Have we seen this stacktrace yet? 
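 * Each unique stack gets exactly one injection: the 'seen' map records the
 * stacks that have already tripped, so a long stress run works through every
 * distinct error path once.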
*/ 32 | u64 key = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID); 33 | u64 zero = 0; 34 | u64 *val = seen.lookup_or_init(&key, &zero); 35 | if (*val == 1) 36 | return 0; 37 | lock_xadd(val, 1); 38 | lock_xadd(e, 1); 39 | 40 | events.perf_submit(ctx, &key, sizeof(key)); 41 | bpf_trace_printk("overriding something\\n"); 42 | unsigned long rc = RCVAL; 43 | bpf_override_return(ctx, rc); 44 | return 0; 45 | } 46 | """ 47 | 48 | error_tripped = 0 49 | 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument("-o", "--override", required=True, 52 | help="The function to override") 53 | parser.add_argument("-r", "--retval", type=str, help="The return value to use") 54 | parser.add_argument("-e", "--executable", type=str, required=True, 55 | help="The command to run") 56 | parser.add_argument("-c", "--cgroup", type=str, required=True, 57 | help="Path to the cgroup we'll be using for this") 58 | 59 | args = parser.parse_args() 60 | retval = "NULL" 61 | 62 | if args.retval is not None: 63 | retval = args.retval 64 | 65 | bpf_text = bpf_text.replace("RCVAL", retval) 66 | 67 | fd = os.open(args.cgroup, os.O_RDONLY) 68 | 69 | print("Loading error injection") 70 | b = BPF(text=bpf_text) 71 | 72 | # Load the cgroup id into the table 73 | t = b.get_table("cgroup") 74 | t[0] = fd 75 | 76 | # Attach the override kprobe now; the 'enabled' flag gates injection, so 77 | # this is safe to do before the workload is started and we can't error out 78 | # pids prematurely. 79 | b.attach_kprobe(event=args.override, fn_name="override_function") 80 | 81 | def handle_error(cpu, data, size): 82 | stackid = ct.cast(data, ct.POINTER(ct.c_ulonglong)).contents 83 | stack_traces = b.get_table("stack_traces") 84 | stack = stack_traces.walk(stackid.value) 85 | print("Injected error here") 86 | for addr in stack: 87 | print(" %s" % b.ksym(addr)) 88 | globals()['error_tripped'] = 1 89 | 90 | b["events"].open_perf_buffer(handle_error) 91 | 92 | while 1: 93 | print("Running command") 94 | p = Popen(args.executable) 95 | 96 | error_tripped = 0 97 | t = b.get_table("enabled") 98 | t[0] = ct.c_int(1) 99 | while error_tripped == 0: 100 | b.kprobe_poll(timeout=30) 101 | if p.poll() is not None: 102 | print("The command exited, breaking") 103 | break 104 | 105 | print("Waiting for the command to exit") 106 | p.wait() 107 | 108 | if error_tripped == 0: 109 | print("Error injection didn't trip anything, exiting") 110 | break 111 | 112 | # We have to remove the probe when we're done, otherwise we could end up with a half 113 | # populated hashmap and overriding legitimate things.
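# Example invocation (the cgroup path and override target are hypothetical;
# as above, the target has to be error-injectable):
#   python error-injection-stress.py -o btrfs_cow_block -r -5 -e ./run-fsstress.sh -c /sys/fs/cgroup/error-inject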
114 | b.detach_kprobe(args.override) 115 | print("Exiting") 116 | 117 | -------------------------------------------------------------------------------- /error.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRATCH_DEV=/dev/vg0/lv0 4 | SCRATCH_MNT=/mnt/test 5 | FSSTRESS_PROG=/root/xfstests-dev/ltp/fsstress 6 | 7 | mount $SCRATCH_DEV $SCRATCH_MNT || exit 1 8 | 9 | btrfs balance start --full-balance $SCRATCH_MNT 10 | while [ 1 ] 11 | do 12 | btrfs ba status $SCRATCH_MNT && break 13 | sleep 1 14 | done 15 | 16 | while [ 1 ] 17 | do 18 | umount $SCRATCH_MNT && break 19 | done 20 | -------------------------------------------------------------------------------- /find-missing-range.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import re 3 | import sys 4 | 5 | class Range: 6 | def __init__(self, off, length, times, reserve): 7 | self._off = off 8 | self._len = length 9 | self._times = times 10 | self._reserve = reserve 11 | 12 | def contains(self, off): 13 | return self._off <= off and (self._off + self._len) > off 14 | 15 | def __repr__(self): 16 | return "off={} len={} times={} reserve={}".format(self._off, self._len, 17 | self._times, 18 | self._reserve) 19 | 20 | def collapse_list(l): 21 | newlist = [] 22 | while len(l): 23 | remaining = [] 24 | cur = l[0] 25 | for i in range(1, len(l)): 26 | n = l[i] 27 | if (cur._off + cur._len) == n._off and n._times == cur._times: 28 | cur._len += n._len 29 | else: 30 | remaining.append(n) 31 | newlist.append(cur) 32 | l = sorted(remaining, key=lambda x: x._off) 33 | newlist.sort(key=lambda x: x._off) 34 | return newlist 35 | 36 | def carve(r, n, excess): 37 | if r._off < n._off: 38 | x = Range(r._off, n._off - r._off, r._times, r._reserve) 39 | excess.append(x) 40 | r._off = n._off 41 | r._len -= x._len 42 | if r._off + r._len > n._off + n._len: 43 | x = Range(n._off + n._len, (r._off + r._len) - (n._off + n._len), 44 | r._times, r._reserve) 45 | excess.append(x) 46 | r._len -= x._len 47 | 48 | def find(r, l): 49 | for x in l: 50 | if x.contains(r._off) or r.contains(x._off): 51 | return x 52 | return None 53 | 54 | def add_entry(r, l): 55 | n = find(r, l) 56 | if n is None: 57 | l.append(r) 58 | return 59 | while r._off != n._off or r._len != n._len: 60 | tmp = [] 61 | if r._off == n._off: 62 | if r._len > n._len: 63 | carve(r, n, tmp) 64 | else: 65 | carve(n, r, tmp) 66 | elif r.contains(n._off): 67 | carve(r, n, tmp) 68 | elif n.contains(r._off): 69 | carve(n, r, tmp) 70 | else: 71 | print("FUCK") 72 | sys.exit(1) 73 | for x in tmp: 74 | add_entry(x, l) 75 | if r._off != n._off or r._len != n._len: 76 | print("We fucked up") 77 | sys.exit(1) 78 | n._times += r._times 79 | 80 | getre = re.compile("Get off=(\d+) bytes=(\d+) times=(\d+)") 81 | putre = re.compile("Put off=(\d+) bytes=(\d+) times=(\d+)") 82 | 83 | getlist = [] 84 | putlist = [] 85 | 86 | totalput = 0 87 | totalget = 0 88 | f = open("out.txt") 89 | for line in iter(f): 90 | reserve = True 91 | m = getre.match(line) 92 | if m is None: 93 | m = putre.match(line) 94 | if m is None: 95 | continue 96 | reserve = False 97 | r = Range(int(m.group(1)), int(m.group(2)), int(m.group(3)), reserve) 98 | if reserve: 99 | totalget += (r._len * r._times) 100 | add_entry(r, getlist) 101 | else: 102 | totalput += (r._len * r._times) 103 | add_entry(r, putlist) 104 | 105 | getlist.sort(key=lambda x: x._off) 106 | putlist.sort(key=lambda x: x._off) 107 | getlist = 
collapse_list(getlist) 108 | putlist = collapse_list(putlist) 109 | 110 | offset = 0 111 | print("getlist") 112 | for i in getlist: 113 | if i._off != offset: 114 | for n in range(offset, i._off, 4096): 115 | print('.', end='') 116 | for n in range(i._off, i._off + i._len, 4096): 117 | print("{}".format(i._times), end='') 118 | offset = i._off + i._len 119 | offset = 0 120 | print("\nputlist") 121 | for i in putlist: 122 | if i._off != offset: 123 | for n in range(offset, i._off, 4096): 124 | print('.', end='') 125 | for n in range(i._off, i._off + i._len, 4096): 126 | print("{}".format(i._times), end='') 127 | offset = i._off + i._len 128 | 129 | print("\ntotat get {} totalput {}".format(totalget, totalput)) 130 | print("Starting phase one, len {}, getlist len {}".format(len(putlist), 131 | len(getlist))) 132 | 133 | loops=1 134 | while True: 135 | nextput = [] 136 | nextget = [] 137 | for r in putlist: 138 | n = find(r, getlist) 139 | if loops > 1: 140 | print("doing r {} n {}".format(r, n)) 141 | if n is None: 142 | print("breaking!?!?") 143 | break 144 | while r._off != n._off or r._len != n._len: 145 | if r._off == n._off: 146 | if r._len > n._len: 147 | print("this is what's happening? {} {}".format(r, n)) 148 | carve(r, n, nextput) 149 | else: 150 | print("adding some shit to nextget") 151 | carve(n, r, nextget) 152 | elif r.contains(n._off): 153 | print("r {} contains {}".format(r._off, n._off)) 154 | carve(r, n, nextput) 155 | elif n.contains(r._off): 156 | print("adding some shit to nextget") 157 | carve(n, r, nextget) 158 | else: 159 | print("FUCK") 160 | sys.exit(1) 161 | if r._off != n._off or r._len != n._len: 162 | print("We fucked up r={} n={}".format(r, n)) 163 | sys.exit(1) 164 | getlist.remove(n) 165 | if r._times != n._times: 166 | print("this is the fucked up one {} {}".format(r, n)) 167 | if r._times <= n._times: 168 | n._times -= r._times 169 | if n._times != 0: 170 | print("ALSKJDF:DAJFadding some shit to nextget") 171 | nextget.append(n) 172 | else: 173 | r._times -= n._times 174 | nextput.append(r) 175 | nextget.extend(getlist) 176 | print("nextget len {}".format(len(nextget))) 177 | nextget.sort(key=lambda x: x._off) 178 | print("nextget len {}".format(len(nextget))) 179 | getlist = collapse_list(nextget) 180 | print("getlist len {}".format(len(getlist))) 181 | print("putlist len {}".format(len(putlist))) 182 | loops += 1 183 | if len(putlist) == len(nextput): 184 | print("we're done loops {}".format(loops)) 185 | print(getlist) 186 | print(putlist) 187 | break 188 | putlist = sorted(nextput, key=lambda x: x._off) 189 | 190 | for x in getlist: 191 | print("off={} len={} times={} remaining".format(x._off, x._len, x._times)) 192 | -------------------------------------------------------------------------------- /fuck-you-pagecache.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from bcc import BPF 3 | import ctypes as ct 4 | 5 | bpf_text = """ 6 | #include 7 | #include 8 | #include 9 | 10 | typedef struct actor_s { 11 | u64 pid; 12 | u64 stackid; 13 | u64 read_size; 14 | } actor_t; 15 | 16 | BPF_HASH(plugs, u64); 17 | BPF_HASH(writes, u64); 18 | BPF_HASH(traces, actor_t); 19 | BPF_HASH(reads, u64); 20 | BPF_HASH(readahead, u64); 21 | BPF_PERF_OUTPUT(events); 22 | BPF_STACK_TRACE(stack_traces, 1024); 23 | BPF_PERF_OUTPUT(read_events); 24 | 25 | int trace_blk_start_plug(struct pt_regs *ctx) 26 | { 27 | u64 pid = bpf_get_current_pid_tgid(); 28 | u64 tmp = 12345; 29 | 30 | 
plugs.update(&pid, &tmp); 31 | return 0; 32 | } 33 | 34 | int trace_blk_finish_plug(struct pt_regs *ctx) 35 | { 36 | u64 pid = bpf_get_current_pid_tgid(); 37 | plugs.delete(&pid); 38 | return 0; 39 | } 40 | 41 | int trace_vfs_write(struct pt_regs *ctx) 42 | { 43 | u64 pid = bpf_get_current_pid_tgid(); 44 | u64 tmp = 12345; 45 | 46 | writes.update(&pid, &tmp); 47 | return 0; 48 | } 49 | 50 | int trace_vfs_read(struct pt_regs *ctx, struct file *file, char *buf, 51 | size_t count) 52 | { 53 | u64 pid = bpf_get_current_pid_tgid(); 54 | u64 tmp = count; 55 | reads.update(&pid, &tmp); 56 | return 0; 57 | } 58 | 59 | int trace_vfs_read_ret(struct pt_regs *regs) 60 | { 61 | u64 pid = bpf_get_current_pid_tgid(); 62 | reads.delete(&pid); 63 | return 0; 64 | } 65 | 66 | int trace_vfs_write_ret(struct pt_regs *ctx) 67 | { 68 | u64 pid = bpf_get_current_pid_tgid(); 69 | writes.delete(&pid); 70 | return 0; 71 | } 72 | 73 | int trace_add_to_page_cache_locked(struct pt_regs *ctx, struct page *page, 74 | struct address_space *mapping, pgoff_t offset) 75 | { 76 | u64 magic = mapping->host->i_sb->s_magic; 77 | 78 | if (magic != 0x58465342) 79 | return 0; 80 | 81 | u64 pid = bpf_get_current_pid_tgid(); 82 | u64 read_size = 0; 83 | u64 *tmp; 84 | 85 | tmp = writes.lookup(&pid); 86 | if (tmp) 87 | return 0; 88 | 89 | tmp = plugs.lookup(&pid); 90 | if (tmp) 91 | return 0; 92 | 93 | tmp = reads.lookup(&pid); 94 | if (tmp) 95 | read_size = *tmp; 96 | u64 stackid = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID); 97 | u64 index = offset; 98 | u64 zero = 0; 99 | 100 | actor_t actor = { 101 | .pid = pid >> 32, 102 | .stackid = stackid, 103 | .read_size = read_size, 104 | }; 105 | tmp = traces.lookup_or_init(&actor, &zero); 106 | (*tmp)++; 107 | events.perf_submit(ctx, &index, sizeof(index)); 108 | return 0; 109 | } 110 | 111 | int trace_ondemand_readahead(struct pt_regs *ctx, struct address_space *mapping, 112 | struct file_ra_state *ra, struct file *filp, 113 | bool hit_readahead_marker, pgoff_t offset) 114 | { 115 | u64 magic = mapping->host->i_sb->s_magic; 116 | 117 | if (hit_readahead_marker) 118 | return 0; 119 | if (magic != 0x58465342) 120 | return 0; 121 | u64 pid = bpf_get_current_pid_tgid(); 122 | u64 read_offset = offset; 123 | readahead.update(&pid, &read_offset); 124 | return 0; 125 | } 126 | 127 | int trace_do_page_cache_readahead(struct pt_regs *ctx, struct address_space *mapping, 128 | struct file *filep, pgoff_t start, unsigned long nr_to_read) 129 | { 130 | u64 pid = bpf_get_current_pid_tgid(); 131 | u64 *tmp; 132 | tmp = readahead.lookup(&pid); 133 | if (!tmp) 134 | return 0; 135 | if (*tmp != start) { 136 | actor_t actor = { 137 | .pid = pid, 138 | .stackid = start, 139 | .read_size = *tmp, 140 | }; 141 | read_events.perf_submit(ctx, &actor, sizeof(actor)); 142 | } 143 | readahead.delete(&pid); 144 | return 0; 145 | } 146 | """ 147 | 148 | b = BPF(text=bpf_text) 149 | b.attach_kprobe(event="blk_start_plug", fn_name="trace_blk_start_plug") 150 | b.attach_kprobe(event="blk_finish_plug", fn_name="trace_blk_finish_plug") 151 | b.attach_kprobe(event="__vfs_write", fn_name="trace_vfs_write") 152 | b.attach_kretprobe(event="__vfs_write", fn_name="trace_vfs_write_ret") 153 | b.attach_kprobe(event="vfs_writev", fn_name="trace_vfs_write") 154 | b.attach_kretprobe(event="vfs_writev", fn_name="trace_vfs_write_ret") 155 | b.attach_kprobe(event="__vfs_read", fn_name="trace_vfs_read") 156 | b.attach_kretprobe(event="__vfs_read", fn_name="trace_vfs_read_ret") 157 | 
b.attach_kprobe(event="__add_to_page_cache_locked", fn_name="trace_add_to_page_cache_locked") 158 | b.attach_kprobe(event="ondemand_readahead", fn_name="trace_ondemand_readahead") 159 | b.attach_kprobe(event="__do_page_cache_readahead", fn_name="trace_do_page_cache_readahead") 160 | 161 | class Actor(ct.Structure): 162 | _fields_ = [ 163 | ("pid", ct.c_ulonglong), 164 | ("stackid", ct.c_ulonglong), 165 | ("read_size", ct.c_ulonglong), 166 | ] 167 | 168 | def print_data(cpu, data, size): 169 | event = ct.cast(data, ct.POINTER(ct.c_ulonglong)).contents 170 | print("added page out of band index %s" % (event.value)) 171 | 172 | def print_read_events(cpu, data, size): 173 | event = ct.cast(data, ct.POINTER(Actor)).contents 174 | print("mismatch offset, wanted %s, got %s, pid %s" % (event.read_size, event.stackid, event.pid)) 175 | 176 | b["events"].open_perf_buffer(print_data) 177 | b["read_events"].open_perf_buffer(print_read_events) 178 | traces = b.get_table("traces") 179 | stack_traces = b.get_table("stack_traces") 180 | while 1: 181 | b.kprobe_poll() 182 | for k,v in traces.items(): 183 | stack = stack_traces.walk(k.stackid) 184 | print("Pid %d read %d" % (k.pid, k.read_size)) 185 | for addr in stack: 186 | print(" %s" % b.ksym(addr)) 187 | print("\n") 188 | traces.clear() 189 | -------------------------------------------------------------------------------- /inject-error.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | from time import sleep 3 | from subprocess import Popen 4 | import argparse 5 | import sys 6 | 7 | bpf_text = """ 8 | #include 9 | #include 10 | 11 | BPF_HASH(fail_pids, u64); 12 | 13 | int trigger_function(struct pt_regs *ctx) 14 | { 15 | u64 pid = bpf_get_current_pid_tgid(); 16 | u64 zero = 0; 17 | u64 *val; 18 | val = fail_pids.lookup_or_init(&pid, &zero); 19 | lock_xadd(val, 1); 20 | return 0; 21 | } 22 | 23 | int trigger_function_ret(struct pt_regs *ctx) 24 | { 25 | u64 pid = bpf_get_current_pid_tgid(); 26 | u64 *val; 27 | val = fail_pids.lookup(&pid); 28 | if (!val) 29 | return 0; 30 | lock_xadd(val, -1); 31 | return 0; 32 | } 33 | 34 | int override_function(struct pt_regs *ctx) 35 | { 36 | u64 pid = bpf_get_current_pid_tgid(); 37 | u64 *val; 38 | 39 | val = fail_pids.lookup(&pid); 40 | if (!val) 41 | return 0; 42 | if (*val != FAIL_CNT) 43 | return 0; 44 | 45 | bpf_trace_printk("overrding something\\n"); 46 | unsigned long rc = RCVAL; 47 | bpf_override_return(ctx, rc); 48 | return 0; 49 | } 50 | """ 51 | 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("-o", "--override", help="The function to override") 54 | parser.add_argument("-r", "--retval", type=int, help="The return value to use") 55 | parser.add_argument("-t", "--trigger", action='append', 56 | help="The function that must be called to trigger the error injection") 57 | parser.add_argument("-d", "--delay", type=int, 58 | help="The delay to wait before injecting the error") 59 | parser.add_argument("-T", "--timeout", type=int, 60 | help="Timeout after error injection has been loaded to wait on the task") 61 | parser.add_argument("COMMAND", nargs='+', help="The command to run") 62 | 63 | args = parser.parse_args() 64 | retval = -12 65 | 66 | if not args.override: 67 | print("Must specify an override function") 68 | sys.exit(1) 69 | if not args.trigger: 70 | print("Must specify a function as the trigger function") 71 | sys.exit(1) 72 | if args.retval: 73 | retval = args.retval 74 | 75 | bpf_text = bpf_text.replace("RCVAL", 
str(retval)) 76 | bpf_text = bpf_text.replace("FAIL_CNT", str(len(args.trigger))) 77 | 78 | print("Running command") 79 | p = Popen(args.COMMAND) 80 | if args.delay: 81 | print("Sleeping for {} seconds".format(args.delay)) 82 | sleep(args.delay) 83 | 84 | print("Loading error injection") 85 | b = BPF(text=bpf_text) 86 | 87 | # Load the kretprobe first, because we want the delete guy to be in place before 88 | # the add guy is in place, otherwise we could error out pids that are no longer 89 | # in our path and cause unfortunate things to happen. 90 | for t in args.trigger: 91 | b.attach_kretprobe(event=t, fn_name="trigger_function_ret") 92 | for t in args.trigger: 93 | b.attach_kprobe(event=t, fn_name="trigger_function") 94 | b.attach_kprobe(event=args.override, fn_name="override_function") 95 | 96 | print("Dropping caches") 97 | f = open("/proc/sys/vm/drop_caches", "w") 98 | f.write("3") 99 | f.close() 100 | 101 | print("Waiting for the command to exit") 102 | while p.poll() is None: 103 | if args.timeout: 104 | sleep(args.timeout) 105 | if p.poll() is None: 106 | print("Killing the task, it didn't die") 107 | f = open("nofail.txt", "a") 108 | f.write(" ".join(args.trigger) + "\n") 109 | f.close() 110 | p.kill() 111 | p.wait() 112 | break 113 | p.wait() 114 | 115 | # We have to remove in this order otherwise we could end up with a half 116 | # populated hashmap and overriding legitimate things. 117 | b.detach_kprobe(args.override) 118 | for t in args.trigger: 119 | b.detach_kprobe(t) 120 | for t in args.trigger: 121 | b.detach_kretprobe(t) 122 | print("Exiting") 123 | -------------------------------------------------------------------------------- /kernelparse/codepaths.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from kernelparse import FileParser,FunctionTree 3 | import os 4 | import argparse 5 | 6 | def find_all_paths(func, path, visited): 7 | path = path + [func.name] 8 | visited.append(func.name) 9 | if len(func.callers) == 0: 10 | return [path] 11 | paths = [] 12 | for c in func.callers.keys(): 13 | if c in visited: 14 | continue 15 | newpaths = find_all_paths(func.callers[c], path, visited) 16 | for newpath in newpaths: 17 | paths.append(newpath) 18 | return paths 19 | 20 | parser = argparse.ArgumentParser(description="Find the callers of a specific function") 21 | parser.add_argument("-d", "--directory", action='append', 22 | help="Directories to scan") 23 | parser.add_argument("function", help="The function to find") 24 | args = parser.parse_args() 25 | 26 | directories = ["."] 27 | if args.directory is not None: 28 | directories = args.directory 29 | 30 | p = FileParser() 31 | ft = FunctionTree() 32 | 33 | for d in directories: 34 | p.parse_path(d, ft) 35 | 36 | if args.function not in ft.functions: 37 | print("Couldn't find the function call {}".format(args.function)) 38 | print(ft.functions.keys()) 39 | else: 40 | paths = find_all_paths(ft.functions[args.function], [], []) 41 | psorted = sorted(paths, key=lambda x:len(x)) 42 | for i in psorted: 43 | print(i[-1]) 44 | -------------------------------------------------------------------------------- /kernelparse/kernelparse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | class Function: 5 | def __init__(self, name, definition, defined=False): 6 | self.name = name 7 | self.defined = defined 8 | self.calls = {} 9 | self.callers = {} 10 | self.args = [] 11 | self.recurses = False 12 |
self.definition = definition 13 | self.content = "" 14 | 15 | def add_content(self, buf): 16 | self.content += buf 17 | 18 | def add_call(self, call, args): 19 | if call.name in self.calls: 20 | self.calls[call.name]['count'] += 1 21 | if args not in self.calls[call.name]['args']: 22 | self.calls[call.name]['args'].extend([args]) 23 | return 24 | self.calls[call.name] = {} 25 | self.calls[call.name]['func'] = call 26 | self.calls[call.name]['count'] = 1 27 | self.calls[call.name]['args'] = [args] 28 | 29 | def add_caller(self, caller): 30 | if caller.name == self.name: 31 | return 32 | if caller.name in self.callers: 33 | return 34 | self.callers[caller.name] = caller 35 | 36 | def add_args(self, args): 37 | if args not in self.args: 38 | self.args.extend([args]) 39 | 40 | def contains_calls(self, funcs): 41 | if len(self.calls) == 0: 42 | return False 43 | if not set(funcs).isdisjoint(self.calls.keys()): 44 | return True 45 | for f in self.calls.keys(): 46 | if self.calls[f]['func'].contains_calls(funcs): 47 | return True 48 | return False 49 | 50 | def _count_calls(self, func, seen): 51 | if self.name in seen: 52 | return 0 53 | seen.append(self.name) 54 | if len(self.calls.keys()) == 0: 55 | return 0 56 | count = 0 57 | for f in self.calls.keys(): 58 | if f == func: 59 | count += self.calls[f]['count'] 60 | continue 61 | count += self.calls[f]['func']._count_calls(func, seen) 62 | return count 63 | 64 | def count_calls(self, func): 65 | return self._count_calls(func, []) 66 | 67 | class FunctionTree: 68 | def __init__(self, debug=False): 69 | self.debug = debug 70 | self.functions = {} 71 | 72 | def add_function(self, name, definition): 73 | if name in self.functions: 74 | self.functions[name].defined = True 75 | self.functions[name].definition = definition 76 | return 77 | # print("adding function '{}'".format(name)) 78 | f = Function(name, definition, True) 79 | self.functions[name] = f 80 | 81 | def add_func_call(self, func, call, args): 82 | c = None 83 | # print("adding call '{}'".format(call)) 84 | # A function that calls itself is recursion, not a new call graph edge. 85 | if func.name == call: 86 | func.recurses = True 87 | return 88 | if call not in self.functions: 89 | c = Function(call, "") 90 | self.functions[call] = c 91 | else: 92 | c = self.functions[call] 93 | func.add_call(c, args) 94 | c.add_caller(func) 95 | 96 | class FileParser: 97 | _GLOBAL = 0 98 | _IN_BLOCK = 1 99 | _IN_FUNCTION = 2 100 | # _IN_COMMENT = 2 101 | # _IN_DIRECTIVE = 3 102 | _IN_PAREN = 4 103 | _keywords = ['auto', 'break', 'case', 'char', 'const', 'continue', 104 | 'default', 'do', 'double', 'else', 'enum', 'extern', 105 | 'float', 'for', 'goto', 'if', 'int', 'long', 'register', 106 | 'return', 'short', 'signed', 'sizeof', 'static', 107 | 'struct', 'switch', 'typedef', 'union', 'unsigned', 'void', 108 | 'volatile', 'while'] 109 | 110 | def __init__(self, debug=False): 111 | self.state = [] 112 | 113 | self._function_re = re.compile("[\s\w]+\s+(\w+)\s*\(.*\).*{", re.DOTALL) 114 | self._directive_re = re.compile("^\s*\#.*") 115 | self._comment_block_start_re = re.compile("^\s*\/\*") 116 | self._comment_block_end_re = re.compile(".*\*/") 117 | self._call_re = re.compile("[-()=+/*!|&<>%~^\s,]*(\w+)\s*(\(.*\))", 118 | re.DOTALL|re.MULTILINE) 119 | #self._statement_re = re.compile(".*[;{}]+\s*(?:/\*)*.*(?:\*/)*$", 120 | # re.DOTALL|re.MULTILINE) 121 | self._statement_re = re.compile(".*[;{}]$", 122 | re.DOTALL|re.MULTILINE) 123 | self._special_eol_re = re.compile("\)$", re.MULTILINE) 124 | self._single_line_cond_re = re.compile("^.+\(.*\).+;$") 125 | self.debug = debug
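# Typical use of the parser (a sketch; the directory and function names are
# only examples): build a FunctionTree from a source tree, then walk the call
# graph via Function.calls and Function.callers.
#   p = FileParser()
#   ft = FunctionTree()
#   p.parse_path("fs/btrfs", ft)
#   f = ft.functions["btrfs_sync_file"]
#   print(f.definition, list(f.calls.keys()), list(f.callers.keys()))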
126 | 127 | def _grab_args(self, line): 128 | end_pos = 0 129 | cur_paren_count = 1 130 | for i in range(1, len(line)): 131 | if line[i] == '(': 132 | cur_paren_count += 1 133 | elif line[i] == ')': 134 | cur_paren_count -= 1 135 | if cur_paren_count == 0: 136 | end_pos = i 137 | break 138 | if end_pos == 0: 139 | return "" 140 | return line[1:end_pos] 141 | 142 | # if self._comment_block_start_re.match(line): 143 | # if cur != self._IN_COMMENT: 144 | # self.state.append(self._IN_COMMENT) 145 | # cur = self._IN_COMMENT 146 | # if cur == self._IN_COMMENT and self._comment_block_end_re.match(line): 147 | # self.state.pop() 148 | # return True 149 | # if cur == self._IN_COMMENT: 150 | # return True 151 | 152 | if cur == self._GLOBAL and ';' in line: 153 | return True 154 | return False 155 | 156 | def _collapse_nonblock_statement(self, content): 157 | ret = "" 158 | cur = "" 159 | for s in content.split('\n'): 160 | tmp = cur + s; 161 | open_count = tmp.count('(') 162 | close_count = tmp.count(')') 163 | if open_count == close_count: 164 | if cur == "": 165 | cur = s 166 | else: 167 | cur += " " + s.strip() 168 | ret += cur + '\n' 169 | cur = "" 170 | continue 171 | if cur == "": 172 | cur = s 173 | else: 174 | cur += " " + s.strip() 175 | ret += cur + '\n' 176 | return ret 177 | 178 | def _handle_block(self, line): 179 | if '}' not in line and '{' not in line: 180 | return 181 | 182 | if '{' in line: 183 | self.state.append(self._IN_BLOCK) 184 | if '}' in line: 185 | self.state.pop() 186 | if self.cur_function is None: 187 | return 188 | if self._IN_FUNCTION not in self.state: 189 | content = self.cur_function.content 190 | 191 | # strip the tailing } if there is one 192 | content = "".join(content.rsplit('}', 1)) 193 | # Strip the excess whitespace, this makes testcases easier to write. 
194 | self.cur_function.content = content.rstrip() 195 | 196 | def _handle_function_call(self, ft, buf): 197 | if self._IN_FUNCTION not in self.state: 198 | return 199 | 200 | m = self._call_re.match(buf) 201 | if m is None: 202 | return 203 | remaining = m.group(2) 204 | if m.group(1) not in self._keywords: 205 | # grab the args to save into this call 206 | args = self._grab_args(m.group(2)) 207 | ft.add_func_call(self.cur_function, m.group(1), args); 208 | self._handle_function_call(ft, args) 209 | remaining = m.group(2).replace(args, "", 1) 210 | else: 211 | # strip the first and last () 212 | remaining = m.group(2).replace("(", "", 1) 213 | remaining = "".join(remaining.rsplit(")", 1)) 214 | self._handle_function_call(ft, remaining) 215 | 216 | def _handle_function_def(self, ft, buf): 217 | if self.state[-1] != self._GLOBAL: 218 | return False 219 | 220 | m = self._function_re.match(buf) 221 | if m is None: 222 | if self.debug: 223 | print("Couldn't match '{}'".format(buf)) 224 | return False 225 | definition = "".join(buf.replace('\n', ' ').rsplit('{', 1)).strip() 226 | definition = re.sub('\s+', ' ', definition) 227 | definition = re.sub('\( ', '(', definition) 228 | ft.add_function(m.group(1), definition) 229 | self.state.append(self._IN_FUNCTION) 230 | self.cur_function = ft.functions[m.group(1)] 231 | return True 232 | 233 | def _strip_comments(self, buf): 234 | buf = re.sub("/\*.*\*/", '', buf) 235 | 236 | # no more comments, return 237 | if re.search("/\*.*\*/", buf, flags=re.DOTALL) is None: 238 | return buf 239 | 240 | bufarray = buf.split('\n') 241 | final = [] 242 | incomment = False 243 | for b in bufarray: 244 | if incomment and re.search("\*/", b) is not None: 245 | final.append(re.sub(".*\*/", "", b)) 246 | incomment = False 247 | continue 248 | if re.search("/\*", b) is not None: 249 | final.append(re.sub("/\*.*", "", b)) 250 | incomment = True 251 | continue 252 | if not incomment: 253 | final.append(b) 254 | final = [l for l in final if re.search("^\s*$", l) is None] 255 | return "\n".join(final) + "\n" 256 | 257 | def _strip_macros(self, buf): 258 | ret = "" 259 | inmacro = False 260 | for l in buf.split('\n'): 261 | if not inmacro and '#' not in l: 262 | ret += l + '\n' 263 | continue 264 | if re.search('\\\s*$', l) is None: 265 | if inmacro: 266 | inmacro = False 267 | continue 268 | else: 269 | inmacro = True 270 | return ret 271 | 272 | def _make_pretty(self, buf): 273 | ret = "" 274 | indent = 0 275 | for l in buf.split('\n'): 276 | l = l.strip() 277 | if re.search("\w+:", l): 278 | ret += '\n' + l 279 | continue 280 | if '}' in l: 281 | indent -= 1 282 | ret += '\n' + ' ' * indent + l 283 | if '{' in l: 284 | indent += 1 285 | return ret 286 | 287 | def _expand_syscalls(self, buf): 288 | ret = "" 289 | for l in buf.split('\n'): 290 | if "SYSCALL_DEFINE" not in l: 291 | ret += l + '\n' 292 | continue 293 | m = re.match("SYSCALL_DEFINE\d*\((\w+),\s*(.*)\)$", l) 294 | if m is None: 295 | print("Our regex didn't work for line '{}'".format(l)) 296 | ret += l + '\n' 297 | continue 298 | tmp = "int {}(".format(m.group(1)) 299 | vartype = True 300 | for i in m.group(2).split(','): 301 | if vartype: 302 | tmp += "{} ".format(i.strip()) 303 | vartype = False 304 | else: 305 | tmp += "{}, ".format(i.strip()) 306 | vartype = True 307 | ret += "{})".format("".join(tmp.rsplit(',', 1)).strip()) 308 | return ret 309 | 310 | def parse_file(self, f, ft): 311 | infunction = 0 312 | self.state = [self._GLOBAL] 313 | self.cur_function = None 314 | buf = "" 315 | 316 | # Strip the 
file down to a reasonable set of statements 317 | content = f.read() 318 | 319 | # First strip all the comments 320 | content = self._strip_comments(content) 321 | 322 | # Strip all the macros 323 | content = self._strip_macros(content) 324 | 325 | # Cull any string literals, they could have problematic things and we 326 | # just don't care 327 | content = re.sub("[\"\'].*[\"\']", "STRING", content) 328 | 329 | # Strip any empty lines 330 | content = re.sub("^\s*$", '', content, flags=re.MULTILINE) 331 | 332 | # Just for consistency with testing replace tabs with spaces 333 | content = re.sub('\t', ' ', content) 334 | 335 | content = re.sub('\s+$', '', content, flags=re.MULTILINE) 336 | 337 | # Make sure open braces are on their own line, otherwise it confuses the 338 | # statement stuff. 339 | content = re.sub('\{(?!\n)', '{\n', content) 340 | 341 | # We want to make sure that logical statements are all on one line, so 342 | # things like 343 | # if (a > 344 | # b) 345 | # 346 | # gets turned into 347 | # if (a > b) 348 | content = re.sub('(? bar()) 353 | # So we handle that special case here. 354 | if self._special_eol_re.search(content) is not None: 355 | content = self._collapse_nonblock_statement(content) 356 | 357 | # Turn any 2 line conditional into a block as well, which is 358 | # if (foo) 359 | # bar(); 360 | # becomes 361 | # if (foo) 362 | # { 363 | # bar(); 364 | # } 365 | content = re.sub("^(\s*\w+\s*\(.*\)(?!;))\s(.+;)$", r'\1\n{\n\2\n}', 366 | content, flags=re.MULTILINE) 367 | 368 | # And now the same thing above, except for else, cause it's special 369 | content = re.sub("^(.*else)\s(.+;)$", r'\1\n{\n\2\n}', content, 370 | flags=re.MULTILINE) 371 | 372 | content = re.sub("EXPORT_SYMBOL.*$", '', content, flags=re.MULTILINE) 373 | content = self._expand_syscalls(content) 374 | 375 | content.strip() 376 | 377 | content = self._make_pretty(content) 378 | 379 | print("content is '{}'".format(content)) 380 | for line in content.split('\n'): 381 | buf += line + "\n" 382 | 383 | if self._statement_re.match(buf) is None: 384 | continue 385 | 386 | if self._handle_function_def(ft, buf): 387 | buf = "" 388 | continue 389 | if self.cur_function != None: 390 | self.cur_function.add_content(buf) 391 | self._handle_function_call(ft, buf) 392 | self._handle_block(buf) 393 | buf = "" 394 | 395 | def parse_path(self, path, ft): 396 | if os.path.isdir(path): 397 | for f in os.listdir(path): 398 | self.parse_path(os.path.join(path, f), ft) 399 | elif os.path.isfile(path): 400 | if path.endswith('.c') or path.endswith('.h'): 401 | infile = open(path) 402 | self.parse_file(infile, ft) 403 | infile.close() 404 | 405 | if __name__ == "__main__": 406 | p = FileParser() 407 | ft = FunctionTree() 408 | p.parse_path("fs/xfs/xfs_buf.c", ft) 409 | print(ft.functions.keys()) 410 | -------------------------------------------------------------------------------- /kernelparse/printpaths.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from kernelparse import FileParser,FunctionTree 3 | import os 4 | import argparse 5 | 6 | def find_all_paths(func, destination, path, visited): 7 | path = path + [func.name] 8 | if func.name == destination: 9 | return [path] 10 | visited.append(func.name) 11 | if len(func.calls) == 0: 12 | return [] 13 | paths = [] 14 | for c in func.calls.keys(): 15 | if c in visited: 16 | continue 17 | newpaths = find_all_paths(func.calls[c]['func'], destination, path, visited) 18 | for newpath in newpaths: 
19 | paths.append(newpath) 20 | return paths 21 | 22 | parser = argparse.ArgumentParser(description="Find paths between two functions") 23 | parser.add_argument("-d", "--destination", help="Destination function") 24 | parser.add_argument("-s", "--source", help="Source function") 25 | parser.add_argument("directory", help="The directory to search") 26 | args = parser.parse_args() 27 | 28 | if not args.source or not args.destination: 29 | print("You must specify a source and destination") 30 | exit(1) 31 | 32 | p = FileParser() 33 | ft = FunctionTree() 34 | 35 | p.parse_path(args.directory, ft) 36 | 37 | func = ft.functions[args.source] 38 | paths = find_all_paths(func, args.destination, [], []) 39 | psorted = sorted(paths, key=lambda x:len(x)) 40 | for i in psorted: 41 | print(i) 42 | -------------------------------------------------------------------------------- /kernelparse/test-parse.py: -------------------------------------------------------------------------------- 1 | from kernelparse import FileParser,FunctionTree 2 | 3 | testfile = open("test.c") 4 | p = FileParser() 5 | ft = FunctionTree() 6 | p.parse_file(testfile, ft) 7 | 8 | funcs = { 9 | 'box' : 'int box(void)', 10 | 'bean': 'int bean(void)', 11 | 'boo' : 'int boo(int a, int b)', 12 | 'baz' : 'int baz(int a)', 13 | 'foo' : 'int foo(int a)', 14 | 'duper' : 'int duper(void *obnoxious, int we)', 15 | 'comment_in_front' : 'int comment_in_front(void)', 16 | 'multiline_comment_in_front' : 'int multiline_comment_in_front(void)', 17 | 'funky' : 'int funky(char *foo)', 18 | 'recurse' : 'int recurse(int a)', 19 | 'multiline_if' : 'int multiline_if(void)', 20 | 'multiline_if_2' : 'int multiline_if_2(void)', 21 | 'main' : 'int main(int argc, char **argv)', 22 | 'pointer' : 'int pointer(void *blah)', 23 | 'ifcall' : 'int ifcall(void)', 24 | 'weird_stuff': 'int weird_stuff(int a, int b)', 25 | 'messed_up': 'int messed_up(void)'} 26 | 27 | for name in funcs.keys(): 28 | if name not in ft.functions: 29 | print("FAILED: {} not found".format(name)) 30 | exit(1) 31 | if not ft.functions[name].defined: 32 | print("FAILED: {} definition wasn't found, have {}".format(name, 33 | ft.functions.keys())) 34 | exit(1) 35 | if ft.functions[name].definition != funcs[name]: 36 | print("FAILED: {} definition '{}' doesn't match '{}'".format(name, 37 | ft.functions[name].definition, funcs[name])) 38 | exit(1) 39 | print("PASSED: basic checks") 40 | 41 | func = ft.functions['main'] 42 | content = """ int i = 0; 43 | if (foo(bar()) > baz(boo(bean(), box()))) 44 | { 45 | return 1; 46 | } 47 | if (multiline_if() > multiline_if_2()) 48 | { 49 | return 0; 50 | } 51 | if (i == 1) 52 | { 53 | ifcall(); 54 | } 55 | if (multiline_if() > multiline_if_2()) 56 | { 57 | return 0; 58 | } 59 | funky(STRING); 60 | boo(1, 2); 61 | pointer(&some->weirdness); 62 | if (i == 1) 63 | { 64 | ifcall(); 65 | } 66 | i = (1 + 1) \ 2; 67 | do { 68 | boo(1, 2); 69 | } while (i++ < 10); 70 | if (i == 1) 71 | { 72 | boo(2, 1); 73 | } 74 | else 75 | { 76 | boo(1, 2); 77 | } 78 | return 0;""" 79 | 80 | if func.content != content: 81 | print("FAILED: the content didn't match!") 82 | print("'{}'".format(func.content)) 83 | exit(1) 84 | 85 | calls = ['foo', 'bar', 'baz', 'boo', 'bean', 'box', 'multiline_if', 86 | 'multiline_if_2', 'funky', 'pointer', 'ifcall'] 87 | if set(calls) != set(func.calls.keys()): 88 | print("FAILED: didn't find all the calls".format(func.calls.keys())) 89 | print("Missing '{}'".format(list(set(calls) - set(func.calls.keys())))) 90 | print("Extra 
'{}'".format(list(set(func.calls.keys()) - set(calls)))) 91 | exit(1) 92 | if len(calls) != len(func.calls.keys()): 93 | print("FAILED: too many calls {}".format(func.calls.keys())) 94 | exit(1) 95 | print("PASSED: call checks") 96 | 97 | valid_args = { 'foo' : ['bar()'], 98 | 'bar' : [''], 99 | 'baz' : ['boo(bean(), box())'], 100 | 'boo' : ['bean(), box()', '1, 2', '2, 1'], 101 | 'bean' : [''], 102 | 'box' : [''], 103 | 'funky' : ['STRING'], 104 | 'multiline_if' : [''], 105 | 'multiline_if_2' : [''], 106 | 'pointer' : ['&some->weirdness'], 107 | 'ifcall' : ['']} 108 | 109 | for c in func.calls.keys(): 110 | call = func.calls[c] 111 | name = call['func'].name 112 | if name not in valid_args.keys(): 113 | print("FAILED: {} not in the valid_args list".format(name)) 114 | exit(1) 115 | if set(call['args']) != set(valid_args[name]): 116 | print("FAILED: {} call did not have the right args".format(name)) 117 | print("call args {}".format(call['args'])) 118 | print("valid args {}".format(valid_args[name])) 119 | exit(1) 120 | print("PASSED: args checks") 121 | -------------------------------------------------------------------------------- /kernelparse/test.c: -------------------------------------------------------------------------------- 1 | int box(void) 2 | { 3 | return 3; 4 | } 5 | 6 | SYSCALL_DEFINE(weird_stuff, int, a, int, b) 7 | { 8 | box(); 9 | return 4; 10 | } 11 | 12 | int messed_up(void) 13 | __releases(&someshit) 14 | __acquires(&someothershit) 15 | { 16 | hooray(); 17 | foo: 18 | something(); /* als;dkjfal;sdjf */ 19 | bar: 20 | something_else(); 21 | return 0; 22 | } 23 | 24 | int bean(void) 25 | { 26 | return 2; 27 | } 28 | 29 | int boo(int a, int b) 30 | { 31 | return a > b; 32 | } 33 | 34 | /* 35 | * bing bang boom 36 | */ 37 | int baz(int a) 38 | { 39 | return 1; 40 | } 41 | 42 | int foo(int a) 43 | { /* foo bar */ 44 | return a; 45 | } 46 | 47 | #define some_long_macro() \ 48 | foo() 49 | 50 | struct abc { 51 | int a; 52 | u64 b; 53 | }; 54 | 55 | /* 56 | * Just to be super 57 | */ 58 | int 59 | duper( 60 | void *obnoxious, /* because */ 61 | int we) /* can 62 | be 63 | really 64 | bad 65 | */ 66 | { 67 | return 2; 68 | } 69 | 70 | /* comment in front */int comment_in_front(void) 71 | { 72 | return 1; 73 | } 74 | 75 | typedef struct foo_s; 76 | 77 | struct foo_r { 78 | foo_s (*call)(int b); 79 | }; 80 | 81 | /* multiline 82 | * comment 83 | * in 84 | * front*/int multiline_comment_in_front(void) 85 | { 86 | return 1; 87 | } 88 | 89 | int funky(char *foo) 90 | { 91 | return 1; 92 | } 93 | 94 | int recurse(int a) 95 | { 96 | if (++a < 10) 97 | return recurse(a); 98 | return a; 99 | } 100 | 101 | int multiline_if(void) 102 | { 103 | return 2; 104 | } 105 | 106 | int multiline_if_2(void) 107 | { 108 | return 3; 109 | } 110 | 111 | int pointer(void *blah) 112 | { 113 | return 2; 114 | } 115 | 116 | int ifcall(void) 117 | { 118 | return 1; 119 | } 120 | 121 | int main(int argc, char **argv) 122 | { 123 | int i = 0; 124 | /* 125 | * a multiline comment to make(sure) 126 | * we don't accidentally grab(these as fucntions) 127 | * or ignore stuff; 128 | */ 129 | if (foo(bar()) > baz(boo(bean(), box()))) 130 | return 1; 131 | if (multiline_if() > 132 | multiline_if_2()) 133 | return 0; 134 | 135 | if (i == 1) 136 | ifcall(); 137 | 138 | /* This is for the content stuff to make sure it all ends up on the same 139 | * line. 
140 | */ 141 | if (multiline_if() 142 | > multiline_if_2()) 143 | return 0; 144 | funky("blahblah(boo)"); 145 | boo(1, 2); 146 | pointer(&some->weirdness); 147 | 148 | if (i == 1) ifcall(); 149 | 150 | i = (1 + 1) \ 151 | 2; 152 | 153 | do { 154 | boo(1, 2); 155 | } while (i++ < 10); 156 | 157 | if (i == 1) 158 | boo(2, 1); 159 | else 160 | boo(1, 2); 161 | return 0; 162 | } 163 | -------------------------------------------------------------------------------- /kswapd-work.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | from time import sleep 3 | import signal 4 | 5 | def signal_ignore(signal, frame): 6 | print() 7 | 8 | class SignalInterrupt(Exception): 9 | def __init__(self, message): 10 | super(SignalInterrupt, self).__init__(message) 11 | 12 | def signal_stop(signal, frame): 13 | raise SignalInterrupt("Interrupted!") 14 | 15 | bpf_text = """ 16 | #include 17 | #include 18 | 19 | #define SCANNED_ID 1 20 | #define RECLAIMED_ID 2 21 | #define WAKEUPS 3 22 | 23 | BPF_HASH(counts, int); 24 | 25 | /* We use vmpressure because struct scan_control is internal to vmscan.c, so we 26 | * use vmpressure as an analog. 27 | */ 28 | int trace_vmpressure(struct pt_regs *ctx, gfp_t gfp, struct mem_cgroup *memcg, 29 | bool tree, unsigned long scanned, unsigned long reclaimed) 30 | { 31 | int id; 32 | u64 zero = 0, *val; 33 | 34 | id = SCANNED_ID; 35 | val = counts.lookup_or_init(&id, &zero); 36 | (*val) += scanned; 37 | id = RECLAIMED_ID; 38 | val = counts.lookup_or_init(&id, &zero); 39 | (*val) += reclaimed; 40 | return 0; 41 | } 42 | 43 | /* We can hit this via direct reclaim, but my test cases never hit direct 44 | * reclaim, so I'm taking the easy way out. 45 | */ 46 | int trace_shrink_node(struct pt_regs *ctx) 47 | { 48 | int id = WAKEUPS; 49 | u64 zero = 0, *val; 50 | val = counts.lookup_or_init(&id, &zero); 51 | (*val)++; 52 | return 0; 53 | } 54 | """ 55 | 56 | b = BPF(text=bpf_text) 57 | b.attach_kprobe(event="vmpressure", fn_name="trace_vmpressure") 58 | b.attach_kprobe(event="shrink_node", fn_name="trace_shrink_node") 59 | 60 | print("Tracing, hit Ctrl+C to exit") 61 | signal.signal(signal.SIGINT, signal_stop) 62 | try: 63 | sleep(99999999) 64 | except SignalInterrupt: 65 | signal.signal(signal.SIGINT, signal_ignore) 66 | except KeyboardInterrupt: 67 | signal.signal(signal.SIGINT, signal_ignore) 68 | 69 | counts = b.get_table("counts") 70 | scanned = 0 71 | reclaimed = 0 72 | wakeups = 0 73 | for k,v in counts.items(): 74 | if k.value == 1: 75 | scanned = v.value 76 | if k.value == 2: 77 | reclaimed = v.value 78 | if k.value == 3: 79 | wakeups = v.value 80 | 81 | print("Total wake ups: {}".format(wakeups)) 82 | print("Total scanned: {}".format(scanned)) 83 | print("Total reclaimed: {}".format(reclaimed)) 84 | if wakeups > 0: 85 | print("Avg scanned per run: {}".format(float(scanned) / wakeups)) 86 | print("Avg reclaimed per run: {}".format(float(reclaimed) / wakeups)) 87 | -------------------------------------------------------------------------------- /mm-drgn-helpers.py: -------------------------------------------------------------------------------- 1 | from drgn import FaultError 2 | 3 | def bio_for_each_bvec(prog, bio): 4 | for idx in range(0, bio.bi_vcnt): 5 | yield bio.bi_io_vec[idx] 6 | 7 | def find_slab(name): 8 | for s in list_for_each_entry("struct kmem_cache", prog['slab_caches'].address_of_(), 'list'): 9 | if s.name.string_().decode("utf-8") == name: 10 | return s 11 | 12 | def dump_slabs(): 13 | for s in 
list_for_each_entry("struct kmem_cache", prog['slab_caches'].address_of_(), "list"): 14 | print("{} {}".format(s.name.string_().decode("utf-8"), hex(s.value_()))) 15 | 16 | def _slub_page_objects(prog, slab, page, obj_type): 17 | addr = page_to_virt(page).value_() 18 | addr += slab.red_left_pad 19 | ret = [] 20 | end = addr + slab.size * page.objects 21 | while addr < end: 22 | ret.append(Object(prog, obj_type, address=addr)) 23 | addr += slab.size 24 | return ret 25 | 26 | def slab_page_objects(prog, slab, page, obj_type): 27 | try: 28 | return _slub_page_objects(prog, slab, page, obj_type) 29 | except AttributeError: 30 | pass 31 | ret = [] 32 | offset = 0 33 | if prog.type('struct kmem_cache').has_member('obj_offset'): 34 | offset = slab.obj_offset 35 | for i in range(0, slab.num): 36 | addr = page.s_mem.value_() + i * slab.size + offset 37 | ret.append(Object(prog, obj_type, address=addr)) 38 | return ret 39 | 40 | def for_each_slab_page(prog): 41 | PGSlab = 1 << prog.constant('PG_slab') 42 | for p in for_each_page(prog): 43 | try: 44 | if p.flags.value_() & PGSlab: 45 | yield p 46 | except FaultError: 47 | pass 48 | 49 | def dump_slab_objects(prog, slab, obj_type): 50 | ret = [] 51 | for p in for_each_slab_page(prog): 52 | if p.slab_cache == slab: 53 | ret.extend(slab_page_objects(prog, slab, p, obj_type)) 54 | return ret 55 | -------------------------------------------------------------------------------- /old-socket-debug.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | import ctypes as ct 3 | 4 | b = BPF(text=""" 5 | #include 6 | #include 7 | #include 8 | 9 | #define OP_NAME_LEN 32 10 | 11 | typedef struct sock_data_s { 12 | u64 pid; 13 | u64 port; 14 | u64 bytes; 15 | u64 time; 16 | char opname[OP_NAME_LEN]; 17 | } sock_data_t; 18 | 19 | BPF_HASH(holders, struct socket *); 20 | BPF_HASH(files, struct file *); 21 | BPF_HASH(pids, u64); 22 | BPF_PERF_OUTPUT(sends); 23 | BPF_PERF_OUTPUT(accepts); 24 | BPF_HASH(stack_hash, u64); 25 | BPF_HASH(ops, u64, sock_data_t); 26 | BPF_STACK_TRACE(stack_traces, 1024); 27 | 28 | int trace_inet_stream_connect(struct pt_regs *ctx, struct socket *socket, 29 | struct sockaddr *uaddr) 30 | { 31 | struct sockaddr_in *saddr = (struct sockaddr_in *)uaddr; 32 | u16 port = saddr->sin_port; 33 | port = ntohs(port); 34 | 35 | if (port == 0xcea) { 36 | u64 tmp = 12345; 37 | holders.update(&socket, &tmp); 38 | sock_data_t data = { 39 | .pid = bpf_get_current_pid_tgid(), 40 | .port = port, 41 | .bytes = 0, 42 | .time = bpf_ktime_get_ns(), 43 | .opname = "connect", 44 | }; 45 | sends.perf_submit(ctx, &data, sizeof(data)); 46 | } 47 | return 0; 48 | } 49 | 50 | int trace_sk_filter(struct pt_regs *ctx, struct sock *sk) 51 | { 52 | struct socket *socket = sk->sk_socket; 53 | u64 *tmp; 54 | 55 | tmp = holders.lookup(&socket); 56 | if (!tmp) 57 | return 0; 58 | u64 pid = bpf_get_current_pid_tgid(); 59 | u64 blah = 12345; 60 | pids.update(&pid, &blah); 61 | return 0; 62 | } 63 | 64 | int trace_sk_filter_ret(struct pt_regs *ctx) 65 | { 66 | u64 *tmp; 67 | u64 ret = PT_REGS_RC(ctx); 68 | u64 pid = bpf_get_current_pid_tgid(); 69 | 70 | tmp = pids.lookup(&pid); 71 | if (!tmp) 72 | return 0; 73 | sock_data_t data = { 74 | .pid = pid, 75 | .port = 0, 76 | .bytes = ret, 77 | .time = bpf_ktime_get_ns(), 78 | .opname = "FUCKED", 79 | }; 80 | sends.perf_submit(ctx, &data, sizeof(data)); 81 | pids.delete(&pid); 82 | return 0; 83 | } 84 | 85 | int trace_inet_accept(struct pt_regs *ctx, struct socket *socket, 86 | 
struct socket *newsock) 87 | { 88 | struct file *file = newsock->file; 89 | u16 port = socket->sk->__sk_common.skc_num; 90 | u16 newport = newsock->sk->__sk_common.skc_dport; 91 | 92 | if (port == 0xcea) { 93 | u64 tmp = 12345; 94 | u64 pid = bpf_get_current_pid_tgid(); 95 | holders.update(&newsock, &tmp); 96 | holders.update(&socket, &tmp); 97 | files.update(&file, &tmp); 98 | // pids.update(&pid, &tmp); 99 | sock_data_t data = { 100 | .pid = pid, 101 | .port = newport, 102 | .bytes = 0, 103 | .time = bpf_ktime_get_ns(), 104 | .opname = "accept", 105 | }; 106 | sends.perf_submit(ctx, &data, sizeof(data)); 107 | // accepts.perf_submit(ctx, &pid, sizeof(pid)); 108 | } 109 | return 0; 110 | } 111 | 112 | int trace_sock_sendmsg(struct pt_regs *ctx, struct socket *socket) 113 | { 114 | u64 blah = 12345; 115 | u64 *tmp; 116 | u16 port = socket->sk->__sk_common.skc_dport; 117 | 118 | tmp = holders.lookup(&socket); 119 | if (!tmp) 120 | return 0; 121 | sock_data_t data = { 122 | .pid = bpf_get_current_pid_tgid(), 123 | .port = port, 124 | .opname = "sendmsg", 125 | }; 126 | // pids.update(&data.pid, &blah); 127 | ops.update(&data.pid, &data); 128 | return 0; 129 | } 130 | 131 | int trace_sock_recvmsg(struct pt_regs *ctx, struct socket *socket) 132 | { 133 | u64 pid = bpf_get_current_pid_tgid(); 134 | u64 *tmp, *blah; 135 | u16 port = socket->sk->__sk_common.skc_dport; 136 | 137 | /* 138 | blah = pids.lookup(&pid); 139 | if (!blah) 140 | return 0; 141 | 142 | accepts.perf_submit(ctx, &pid, sizeof(pid)); 143 | */ 144 | tmp = holders.lookup(&socket); 145 | if (!tmp) 146 | return 0; 147 | 148 | sock_data_t data = { 149 | .pid = bpf_get_current_pid_tgid(), 150 | .port = port, 151 | .opname = "recvmsg", 152 | }; 153 | ops.update(&data.pid, &data); 154 | return 0; 155 | } 156 | 157 | int trace_sock_op_ret(struct pt_regs *ctx) 158 | { 159 | u64 bytes = PT_REGS_RC(ctx); 160 | u64 pid = bpf_get_current_pid_tgid(); 161 | sock_data_t *data; 162 | 163 | data = ops.lookup(&pid); 164 | if (!data) 165 | return 0; 166 | data->bytes = bytes; 167 | data->time = bpf_ktime_get_ns(); 168 | sends.perf_submit(ctx, data, sizeof(sock_data_t)); 169 | ops.delete(&pid); 170 | return 0; 171 | } 172 | 173 | int trace_sock_op_ret_recv(struct pt_regs *ctx) 174 | { 175 | u64 bytes = PT_REGS_RC(ctx); 176 | u64 pid = bpf_get_current_pid_tgid(); 177 | sock_data_t *data; 178 | 179 | data = ops.lookup(&pid); 180 | if (!data) 181 | return 0; 182 | data->bytes = bytes; 183 | data->time = bpf_ktime_get_ns(); 184 | sends.perf_submit(ctx, data, sizeof(sock_data_t)); 185 | ops.delete(&pid); 186 | return 0; 187 | } 188 | /* 189 | int trace_sk_method(struct pt_regs *ctx, struct sock *sk) 190 | { 191 | u64 *tmp; 192 | 193 | tmp = holders.lookup(&sk); 194 | if (!tmp) 195 | return 0; 196 | u64 pid = bpf_get_current_pid_tgid(); 197 | sends.perf_submit(ctx, &pid, sizeof(pid)); 198 | holders.delete(&sk); 199 | return 0; 200 | } 201 | int trace_fdget(struct pt_regs *ctx) 202 | { 203 | unsigned long v = (unsigned long)PT_REGS_RC(ctx); 204 | struct file *file = (struct file *)(v & ~3); 205 | u64 *tmp; 206 | 207 | tmp = files.lookup(&file); 208 | if (!tmp) 209 | return 0; 210 | u64 pid = bpf_get_current_pid_tgid(); 211 | u64 stackid = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID); 212 | u64 *val; 213 | u64 zero = 0; 214 | 215 | val = stack_hash.lookup_or_init(&stackid, &zero); 216 | (*val)++; 217 | return 0; 218 | } 219 | int trace_recvfrom_ret(struct pt_regs *ctx) 220 | { 221 | u64 ret = PT_REGS_RC(ctx); 222 | u64 pid = 
bpf_get_current_pid_tgid(); 223 | u64 *tmp; 224 | 225 | tmp = pids.lookup(&pid); 226 | if (!tmp) 227 | return 0; 228 | accepts.perf_submit(ctx, &ret, sizeof(ret)); 229 | return 0; 230 | } 231 | */ 232 | """) 233 | b.attach_kprobe(event="inet_accept", fn_name="trace_inet_accept") 234 | b.attach_kprobe(event="sock_sendmsg", fn_name="trace_sock_sendmsg") 235 | b.attach_kprobe(event="inet_recvmsg", fn_name="trace_sock_recvmsg") 236 | b.attach_kprobe(event="inet_stream_connect", fn_name="trace_inet_stream_connect") 237 | b.attach_kretprobe(event="sock_sendmsg", fn_name="trace_sock_op_ret") 238 | b.attach_kretprobe(event="inet_recvmsg", fn_name="trace_sock_op_ret_recv") 239 | b.attach_kprobe(event="sk_filter", fn_name="trace_sk_filter") 240 | b.attach_kretprobe(event="sk_filter", fn_name="trace_sk_filter_ret") 241 | #b.attach_kprobe(event="tcp_setsockopt", fn_name="trace_sk_method") 242 | #b.attach_kprobe(event="tcp_close", fn_name="trace_sk_method") 243 | #b.attach_kretprobe(event="__fdget", fn_name="trace_fdget") 244 | #b.attach_kretprobe(event="SyS_recvfrom", fn_name="trace_recvfrom_ret") 245 | 246 | class Data(ct.Structure): 247 | _fields_ = [ 248 | ("pid", ct.c_ulonglong), 249 | ("port", ct.c_ulonglong), 250 | ("bytes", ct.c_ulonglong), 251 | ("time", ct.c_ulonglong), 252 | ("opname", ct.c_char * 32), 253 | ] 254 | 255 | def print_pid(cpu, data, size): 256 | event = ct.cast(data, ct.POINTER(Data)).contents 257 | print("%s pid %d tgid %d did %s on port %s with %s bytes" % (event.time, event.pid >> 32, 258 | event.pid & ((1 << 32)-1), event.opname, event.port, event.bytes)) 259 | 260 | #def print_accept(cpu, data, size): 261 | # event = ct.cast(data, ct.POINTER(ct.c_ulonglong)).contents 262 | # print("pid %d tgid %d is accepted" % (event.value >> 32, event.value & ((1 << 32)-1))) 263 | 264 | def print_accept(cpu, data, size): 265 | event = ct.cast(data, ct.POINTER(ct.c_ulonglong)).contents 266 | print("recvfrom ret %s" % (event.value)) 267 | 268 | b["accepts"].open_perf_buffer(print_accept) 269 | b["sends"].open_perf_buffer(print_pid) 270 | stack_traces = b.get_table("stack_traces") 271 | stack_hash = b.get_table("stack_hash") 272 | while 1: 273 | b.kprobe_poll() 274 | for k,v in stack_hash.items(): 275 | stack = stack_traces.walk(k.value) 276 | for addr in stack: 277 | print(" %s" % b.ksym(addr)) 278 | print("\n") 279 | stack_hash.clear() 280 | -------------------------------------------------------------------------------- /orphans.btrd: -------------------------------------------------------------------------------- 1 | filesystem "/"; 2 | 3 | k = key(0, BTRFS_ROOT_ITEM_KEY, 0, 0); 4 | roots = search(BTRFS_ROOT_TREE_OBJECTID, k); 5 | 6 | for r in roots { 7 | rkey = keyof(r); 8 | if rkey.type != BTRFS_ROOT_ITEM_KEY { 9 | continue; 10 | } 11 | if rkey.objectid !=5 && (rkey.objectid < BTRFS_FIRST_FREE_OBJECTID || rkey.objectid > BTRFS_LAST_FREE_OBJECTID) { 12 | continue; 13 | } 14 | 15 | total = 0; 16 | 17 | k = key(BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, 0, 0); 18 | k.max_type = BTRFS_ORPHAN_ITEM_KEY; 19 | orphans = search(rkey.objectid, k); 20 | 21 | for o in orphans { 22 | okey = keyof(o); 23 | 24 | if okey.type != BTRFS_ORPHAN_ITEM_KEY { 25 | continue; 26 | } 27 | 28 | k2 = key(okey.offset, BTRFS_INODE_ITEM_KEY, 0, 0); 29 | k2.max_objectid = okey.offset; 30 | k2.max_type = BTRFS_INODE_ITEM_KEY; 31 | 32 | inodes = search(257, k2); 33 | for i in inodes { 34 | ikey = keyof(i); 35 | if ikey.objectid != okey.offset { 36 | break; 37 | } 38 | 39 | if (ikey.type != BTRFS_INODE_ITEM_KEY) { 
40 | break; 41 | } 42 | 43 | total += i.nbytes; 44 | break; 45 | } 46 | } 47 | 48 | if total > 0 { 49 | print "root " + str(rkey.objectid) + " bytes " + str(total); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /read-pattern.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from bcc import BPF 3 | import ctypes as ct 4 | 5 | debug = 0 6 | 7 | bpf_text = """ 8 | #include 9 | #include 10 | #include 11 | 12 | typedef struct read_data_s { 13 | u64 pos; 14 | u64 count; 15 | char name[32]; 16 | } read_data_t; 17 | 18 | BPF_PERF_OUTPUT(reads); 19 | 20 | int trace_generic_file_read_iter(struct pt_regs *ctx, struct kiocb *iocb, struct iov_iter *i) 21 | { 22 | u64 magic = iocb->ki_filp->f_mapping->host->i_sb->s_magic; 23 | if (magic != 0x58465342) 24 | return 0; 25 | u64 count = i->count; 26 | u64 pos = iocb->ki_pos; 27 | struct dentry *dentry = iocb->ki_filp->f_path.dentry; 28 | 29 | read_data_t data = { 30 | .count = count, 31 | .pos = pos, 32 | }; 33 | bpf_probe_read(&data.name, sizeof(data.name), (void *)dentry->d_name.name); 34 | reads.perf_submit(ctx, &data, sizeof(data)); 35 | return 0; 36 | } 37 | """ 38 | b = BPF(text=bpf_text) 39 | b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_generic_file_read_iter") 40 | 41 | class ReadData(ct.Structure): 42 | _fields_ = [ 43 | ("pos", ct.c_ulonglong), 44 | ("count", ct.c_ulonglong), 45 | ("name", ct.c_char * 32), 46 | ] 47 | 48 | files = {} 49 | 50 | def print_read_data(cpu, data, size): 51 | event = ct.cast(data, ct.POINTER(ReadData)).contents 52 | if event.name not in files: 53 | files[event.name] = [] 54 | l = [ {'pos': int(event.pos), 'count': int(event.count)} ] 55 | files[event.name].extend(l) 56 | 57 | count = 0 58 | b['reads'].open_perf_buffer(print_read_data) 59 | while 1: 60 | b.kprobe_poll() 61 | count += 1 62 | if count > 100: 63 | break 64 | 65 | print("Checking for overlapping areas") 66 | for f in files.keys(): 67 | pos = [] 68 | lens = [] 69 | for l in files[f]: 70 | pos.append(l['pos']) 71 | lens.append(l['count']) 72 | for i in range(0, len(pos)): 73 | cur_pos = pos[i] 74 | cur_len = lens[i] 75 | for c in range(i+1, len(pos)): 76 | test_pos = pos[c] 77 | test_len = lens[c] 78 | if cur_pos >= (test_pos + test_len) or test_pos >= (cur_pos + cur_len): 79 | continue 80 | print("OVERLAP file %s, %d-%d %d-%d" % (f, cur_pos, cur_len, test_pos, test_len)) 81 | 82 | -------------------------------------------------------------------------------- /referenced-objects.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | from time import sleep 3 | import signal 4 | 5 | def signal_ignore(signal, frame): 6 | print() 7 | 8 | class SignalInterrupt(Exception): 9 | def __init__(self, message): 10 | super(SignalInterrupt, self).__init__(message) 11 | 12 | def signal_stop(signal, frame): 13 | raise SignalInterrupt("Interrupted!") 14 | 15 | bpf_text = """ 16 | #include 17 | #include 18 | #include 19 | 20 | #define INODE_ID 1 21 | #define DENTRY_ID 2 22 | 23 | typedef struct dentry_storage_s { 24 | struct dentry *dentry; 25 | } dentry_storage_t; 26 | 27 | typedef struct inode_storage_s { 28 | struct inode *inode; 29 | } inode_storage_t; 30 | 31 | BPF_HASH(dentries, u64, dentry_storage_t); 32 | BPF_HASH(inodes, u64, inode_storage_t); 33 | BPF_HASH(referenced, u64); 34 | 35 | static int inc_referenced(u64 id) 36 | { 37 | u64 *val, zero = 0; 38 | val = 
referenced.lookup_or_init(&id, &zero); 39 | lock_xadd(val, 1); 40 | return 0; 41 | } 42 | 43 | int trace_dentry_lru_add(struct pt_regs *ctx, struct dentry *dentry) 44 | { 45 | u64 pid = bpf_get_current_pid_tgid(); 46 | if (dentry->d_flags & DCACHE_REFERENCED) 47 | return 0; 48 | dentry_storage_t data = { 49 | .dentry = dentry, 50 | }; 51 | dentries.update(&pid, &data); 52 | return 0; 53 | } 54 | 55 | int trace_dentry_lru_add_ret(struct pt_regs *ctx) 56 | { 57 | u64 pid = bpf_get_current_pid_tgid(); 58 | unsigned int flags; 59 | dentry_storage_t *data; 60 | 61 | data = dentries.lookup(&pid); 62 | if (!data) 63 | return 0; 64 | bpf_probe_read(&flags, sizeof(unsigned int), &data->dentry->d_flags); 65 | if (flags & DCACHE_REFERENCED) 66 | inc_referenced(DENTRY_ID); 67 | dentries.delete(&pid); 68 | return 0; 69 | } 70 | 71 | int trace_inode_lru_list_add(struct pt_regs *ctx, struct inode *inode) 72 | { 73 | u64 pid = bpf_get_current_pid_tgid(); 74 | if (inode->i_state & I_REFERENCED) 75 | return 0; 76 | inode_storage_t data = { 77 | .inode = inode, 78 | }; 79 | inodes.update(&pid, &data); 80 | return 0; 81 | } 82 | 83 | int trace_inode_lru_list_add_ret(struct pt_regs *ctx) 84 | { 85 | u64 pid = bpf_get_current_pid_tgid(); 86 | unsigned long state; 87 | inode_storage_t *data; 88 | 89 | data = inodes.lookup(&pid); 90 | if (!data) 91 | return 0; 92 | bpf_probe_read(&state, sizeof(unsigned long), &data->inode->i_state); 93 | if (state & I_REFERENCED) 94 | inc_referenced(INODE_ID); 95 | inodes.delete(&pid); 96 | return 0; 97 | } 98 | """ 99 | 100 | b = BPF(text=bpf_text) 101 | b.attach_kprobe(event="inode_lru_list_add", fn_name="trace_inode_lru_list_add") 102 | b.attach_kretprobe(event="inode_lru_list_add", fn_name="trace_inode_lru_list_add_ret") 103 | b.attach_kprobe(event="dentry_lru_add", fn_name="trace_dentry_lru_add") 104 | b.attach_kretprobe(event="dentry_lru_add", fn_name="trace_dentry_lru_add_ret") 105 | 106 | print("Tracing, hit Ctrl+C to exit") 107 | signal.signal(signal.SIGINT, signal_stop) 108 | try: 109 | sleep(99999999) 110 | except KeyboardInterrupt: 111 | signal.signal(signal.SIGINT, signal_ignore) 112 | except SignalInterrupt: 113 | signal.signal(signal.SIGINT, signal_ignore) 114 | 115 | referenced_table = b.get_table("referenced") 116 | 117 | for k,v in referenced_table.items(): 118 | if k.value == 1: 119 | print("referenced inodes: {}".format(v.value)) 120 | else: 121 | print("referenced dentries: {}".format(v.value)) 122 | 123 | -------------------------------------------------------------------------------- /rq-latency-dist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from bcc import BPF 4 | import argparse 5 | from time import sleep 6 | 7 | bpf_text = """ 8 | #include 9 | #include 10 | 11 | typedef struct pid_key_s { 12 | u64 id; 13 | u64 slot; 14 | } pid_key_t; 15 | 16 | BPF_HASH(start, u64); 17 | STORAGE 18 | 19 | int woke(struct pt_regs *ctx, struct task_struct *p) 20 | { 21 | u32 pid = p->pid; 22 | u32 tgid = p->tgid; 23 | u64 key = (u64)tgid << 32 | pid; 24 | 25 | if (FILTER) 26 | return 0; 27 | u64 val = bpf_ktime_get_ns(); 28 | start.update(&key, &val); 29 | return 0; 30 | } 31 | 32 | int oncpu(struct pt_regs *ctx) 33 | { 34 | u64 key = bpf_get_current_pid_tgid(); 35 | u32 pid = key; 36 | u64 *tsp = start.lookup(&key); 37 | if (!tsp) 38 | return 0; 39 | u64 delta = bpf_ktime_get_ns() - *tsp; 40 | STORE 41 | return 0; 42 | } 43 | """ 44 | 45 | 46 | parser = argparse.ArgumentParser( 47 | 
description="Track the time processes spend on the runqueue before starting execution") 48 | parser.add_argument("-t", "--tgid", help="trace this TGID only") 49 | parser.add_argument("-p", "--pid", help="trace this PID only") 50 | parser.add_argument("-d", "--duration", nargs="?", default=9999999) 51 | args = parser.parse_args() 52 | 53 | section = "" 54 | 55 | if args.pid: 56 | bpf_text = bpf_text.replace('FILTER', "pid != {}".format(args.pid)) 57 | elif args.tgid: 58 | bpf_text = bpf_text.replace('FILTER', "tgid != {}".format(args.tgid)) 59 | else: 60 | bpf_text = bpf_text.replace('FILTER', '0') 61 | 62 | if args.pid or args.tgid: 63 | section = "pid" 64 | bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist, pid_key_t);') 65 | bpf_text = bpf_text.replace('STORE', 66 | 'pid_key_t pid_key = { .id = pid, .slot = bpf_log2l(delta)}; ' + 67 | 'dist.increment(pid_key);') 68 | else: 69 | bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);') 70 | bpf_text = bpf_text.replace('STORE', 'dist.increment(bpf_log2l(delta));') 71 | 72 | b = BPF(text=bpf_text) 73 | b.attach_kprobe(event='finish_task_switch', fn_name='oncpu') 74 | b.attach_kprobe(event='try_to_wake_up', fn_name='woke') 75 | 76 | print("Tracing") 77 | try: 78 | sleep(int(args.duration)) 79 | except KeyboardInterrupt: 80 | print("interrupted, dumping info") 81 | 82 | dist = b.get_table("dist") 83 | 84 | def pid_to_comm(pid): 85 | try: 86 | comm = open("/proc/%d/comm" % pid, "r").read() 87 | return "%d %s" % (pid, comm) 88 | except IOError: 89 | return str(pid) 90 | 91 | dist.print_log2_hist("nsecs", section, section_print_fn=pid_to_comm) 92 | -------------------------------------------------------------------------------- /sched-time.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | from bcc import BPF 4 | import argparse 5 | from time import sleep 6 | import json 7 | import copy 8 | from collections import OrderedDict 9 | 10 | def print_tasks(tasks): 11 | last_tgid = 0 12 | for k,v in tasks: 13 | out_str = "Pid {} tgid {} runtime {} sleeptime {} iotime {} preemttime {}".format( 14 | v.pid, v.tgid, v.run_time, v.sleep_time, v.io_time, v.preempt_time) 15 | if last_tgid != v.tgid: 16 | print(out_str) 17 | last_tgid = v.tgid 18 | else: 19 | print("\t{}".format(out_str)) 20 | 21 | bpf_text = """ 22 | #include 23 | #include 24 | 25 | typedef struct val_s { 26 | u32 pid; 27 | u32 tgid; 28 | u64 run_time; 29 | u64 preempt_time; 30 | u64 sleep_time; 31 | u64 io_time; 32 | u64 run_events; 33 | u64 sleep_events; 34 | u32 short_lived; 35 | u32 priority; 36 | } val_t; 37 | 38 | typedef struct sleep_val_s { 39 | u64 ts; 40 | u64 state; 41 | } sleep_val_t; 42 | 43 | typedef struct wake_dep_s { 44 | u32 waker_pid; 45 | u32 sleeper_pid; 46 | u32 tgid; 47 | } wake_dep_t; 48 | 49 | BPF_HASH(tasks, u64, val_t); 50 | BPF_HASH(wake_deps, wake_dep_t); 51 | BPF_HASH(start, u64); 52 | BPF_HASH(end, u64, sleep_val_t); 53 | BPF_HASH(futexes, u64); 54 | 55 | int waker(struct pt_regs *ctx, struct task_struct *p) 56 | { 57 | u32 pid = p->pid; 58 | u32 tgid = p->tgid; 59 | 60 | if (!(PID_FILTER)) 61 | return 0; 62 | u64 pid_key = bpf_get_current_pid_tgid(); 63 | pid = pid_key; 64 | tgid = pid_key >> 32; 65 | if (tgid != p->tgid) 66 | return 0; 67 | if (!(PID_FILTER)) 68 | return 0; 69 | u64 *val = futexes.lookup(&pid_key); 70 | if (!val) 71 | return 0; 72 | wake_dep_t info = { 73 | .waker_pid = pid, 74 | .sleeper_pid = p->pid, 75 | .tgid = tgid, 76 | }; 77 | u64 zero = 0; 78 | val = 
wake_deps.lookup_or_init(&info, &zero); 79 | (*val)++; 80 | return 0; 81 | } 82 | 83 | int enter_futex(struct pt_regs *ctx) 84 | { 85 | u64 pid_key = bpf_get_current_pid_tgid(); 86 | u64 zero = 0; 87 | futexes.lookup_or_init(&pid_key, &zero); 88 | return 0; 89 | } 90 | 91 | int exit_futex(struct pt_regs *ctx) 92 | { 93 | u64 pid_key = bpf_get_current_pid_tgid(); 94 | futexes.delete(&pid_key); 95 | return 0; 96 | } 97 | 98 | int oncpu(struct pt_regs *ctx, struct task_struct *prev) 99 | { 100 | u64 pid_key = bpf_get_current_pid_tgid(); 101 | u32 pid = pid_key; 102 | u32 tgid = pid_key >> 32; 103 | u64 ts, *tsp; 104 | 105 | if (PID_FILTER) { 106 | sleep_val_t *sval; 107 | ts = bpf_ktime_get_ns(); 108 | start.update(&pid_key, &ts); 109 | val_t zero = { 110 | .pid = pid, 111 | .tgid = tgid, 112 | }; 113 | val_t *info = tasks.lookup_or_init(&pid_key, &zero); 114 | sval = end.lookup(&pid_key); 115 | if (sval) { 116 | u64 sleep_delta = ts - sval->ts; 117 | if (sval->state == TASK_RUNNING) 118 | info->preempt_time += sleep_delta; 119 | else if (sval->state & TASK_INTERRUPTIBLE) { 120 | info->run_events++; 121 | info->sleep_time += sleep_delta; 122 | } else if (sval->state & TASK_UNINTERRUPTIBLE) { 123 | info->run_events++; 124 | info->io_time += sleep_delta; 125 | } 126 | } 127 | end.delete(&pid_key); 128 | } 129 | 130 | pid = prev->pid; 131 | tgid = prev->tgid; 132 | pid_key = (u64)tgid << 32 | pid; 133 | 134 | if (!(PID_FILTER)) 135 | return 0; 136 | tsp = start.lookup(&pid_key); 137 | if (tsp) { 138 | u64 run_delta = bpf_ktime_get_ns() - *tsp; 139 | start.delete(&pid_key); 140 | val_t zero = { 141 | .pid = pid, 142 | .tgid = tgid, 143 | }; 144 | val_t *info = tasks.lookup_or_init(&pid_key, &zero); 145 | info->run_time += run_delta; 146 | info->priority = prev->prio; 147 | info->sleep_events++; 148 | } 149 | sleep_val_t sleep_val = { 150 | .ts = bpf_ktime_get_ns(), 151 | .state = prev->state, 152 | }; 153 | end.update(&pid_key, &sleep_val); 154 | return 0; 155 | } 156 | 157 | int trace_do_exit(struct pt_regs *ctx) 158 | { 159 | u64 pid = bpf_get_current_pid_tgid(); 160 | val_t *info = tasks.lookup(&pid); 161 | if (!info) 162 | return 0; 163 | u64 ts = bpf_ktime_get_ns(), *tsp; 164 | tsp = start.lookup(&pid); 165 | if (tsp) { 166 | u64 delta = ts - *tsp; 167 | info->run_time += delta; 168 | start.delete(&pid); 169 | } 170 | info->short_lived = 1; 171 | return 0; 172 | } 173 | 174 | """ 175 | 176 | parser = argparse.ArgumentParser(description="Summarize cpu usage of a task") 177 | parser.add_argument("--pids", metavar='P', type=int, nargs='+', 178 | help="List of pids to trace") 179 | parser.add_argument("--tgids", metavar='T', type=int, nargs='+', 180 | help="List of tgids to trace") 181 | parser.add_argument("--duration", default=99999999, 182 | type=int, help="duration of trace, in seconds") 183 | parser.add_argument("--rtapp", action='store_true', 184 | help="Output an rt-app config for the run") 185 | args = parser.parse_args() 186 | if not args.pids and not args.tgids: 187 | print("Must specify tgids or pids") 188 | exit(1) 189 | if args.pids and args.tgids: 190 | print("Cannot specify both tgids and pids") 191 | exit(1) 192 | duration = int(args.duration) 193 | filter_str = "" 194 | pids = [] 195 | tgids = [] 196 | if args.pids: 197 | pids = args.pids 198 | if args.tgids: 199 | tgids = args.tgids 200 | for p in pids: 201 | this_str = "pid == {}".format(p) 202 | if len(filter_str): 203 | filter_str += "|| {}".format(this_str) 204 | else: 205 | filter_str = this_str 206 | for p in tgids: 207 
| this_str = "tgid == {}".format(p) 208 | if len(filter_str): 209 | filter_str += "|| {}".format(this_str) 210 | else: 211 | filter_str = this_str 212 | bpf_text = bpf_text.replace('PID_FILTER', filter_str) 213 | 214 | b = BPF(text=bpf_text) 215 | b.attach_kprobe(event="finish_task_switch", fn_name="oncpu") 216 | if args.rtapp: 217 | b.attach_kprobe(event="try_to_wake_up", fn_name="waker") 218 | b.attach_kprobe(event="do_futex", fn_name="enter_futex") 219 | b.attach_kretprobe(event="do_futex", fn_name="exit_futex") 220 | 221 | try: 222 | sleep(duration) 223 | except KeyboardInterrupt: 224 | pass 225 | 226 | tasks = b.get_table("tasks") 227 | sorted_tasks = sorted(tasks.items(), key=lambda run: run[1].tgid, reverse=True) 228 | if not args.rtapp: 229 | print_tasks(sorted_tasks) 230 | exit(0) 231 | 232 | waker_deps = b.get_table("wake_deps") 233 | waker_sets = {} 234 | for k,v in waker_deps.items(): 235 | waker = k.waker_pid 236 | sleeper = k.sleeper_pid 237 | # we add our waker to our list because consumers may wake producers to 238 | # indicate they have completed their task 239 | if waker not in waker_sets: 240 | waker_sets[waker] = set([sleeper]) 241 | elif sleeper not in waker_sets[waker]: 242 | waker_sets[waker].update([sleeper]) 243 | 244 | def reduce(waker_sets): 245 | need_loop = True 246 | groups = {} 247 | counter = 0 248 | while need_loop: 249 | need_loop = False 250 | producer = None 251 | for pid,wakeset in waker_sets.items(): 252 | found = False 253 | need_break = False 254 | for name,base in groups.items(): 255 | if wakeset.issubset(base): 256 | found = True 257 | break 258 | elif wakeset.issuperset(base): 259 | found = True 260 | groups[pid] = wakeset.copy() 261 | groups.pop(name, None) 262 | need_break = True 263 | break 264 | elif len(wakeset.intersection(base)): 265 | need_break = True 266 | waker_sets[pid] -= base 267 | break 268 | if need_break: 269 | need_loop = True 270 | break 271 | if not found: 272 | groups[pid] = wakeset.copy() 273 | need_loop = True 274 | return groups 275 | 276 | groups = {} 277 | loops = 0 278 | while True or loops > 10: 279 | loops += 1 280 | blah = reduce(waker_sets) 281 | if len(groups) != len(blah): 282 | groups = blah 283 | waker_sets = blah 284 | else: 285 | break 286 | 287 | for k,v in groups.items(): 288 | if len(v) == 1: 289 | groups.pop(k, None) 290 | 291 | last_tgid = 0 292 | threads_dict = {} 293 | global_dict = {"duration": args.duration} 294 | threads_list = [] 295 | for k,v in sorted_tasks: 296 | if last_tgid != v.tgid: 297 | if last_tgid != 0: 298 | for name,actions in threads_dict['tasks'].items(): 299 | if actions['instance'] > 1: 300 | actions['run'] /= actions['instance'] 301 | threads_list.append(copy.copy(threads_dict)) 302 | threads_dict = {} 303 | threads_dict["global"] = global_dict 304 | threads_dict["tasks"] = {} 305 | last_tgid = v.tgid 306 | total_time = 1000000 307 | runtime = v.run_time + v.preempt_time 308 | runevents = v.run_events 309 | sleeptime = v.sleep_time + v.io_time 310 | tdict = {} 311 | if v.pid in groups: 312 | tdict['loop'] = -1 313 | tdict['instance'] = 1 314 | if v.priority != 120: 315 | tdict['priority'] = v.priority - 120 316 | tdict['lock'] = 'mutex{}'.format(v.pid) 317 | tdict['broad'] = 'shared{}'.format(v.pid) 318 | tdict['unlock'] = 'mutex{}'.format(v.pid) 319 | tdict['sleep'] = 0 320 | threads_dict["tasks"][v.pid] = tdict 321 | 322 | found = False 323 | for pid,pidset in groups.items(): 324 | if v.pid in pidset: 325 | found = True 326 | name = "threads{}".format(pid) 327 | priority = 0 328 | 
if v.priority != 120: 329 | priority = v.priority - 120 330 | name = "threads{}priority{}".format(pid, priority) 331 | if name not in threads_dict["tasks"]: 332 | threads_dict["tasks"][name] = tdict 333 | tdict['instance'] = 0 334 | tdict['loop'] = -1 335 | if v.priority != 120: 336 | tdict['priority'] = v.priority - 120 337 | tdict['lock'] = 'mutex{}'.format(pid) 338 | tdict['wait'] = { 'ref': 'shared{}'.format(pid), 339 | 'mutex': 'mutex{}'.format(pid) } 340 | tdict['unlock'] = 'mutex{}'.format(pid) 341 | tdict['run'] = 0 342 | else: 343 | tdict = threads_dict["tasks"][name] 344 | tdict['run'] += (runtime / 1000) / runevents 345 | tdict['instance'] += 1 346 | break 347 | 348 | if found: 349 | continue 350 | tdict['instance'] = 1 351 | tdict['loop'] = -1 352 | tdict['run'] = (runtime * total_time) / (runtime + sleeptime) 353 | if sleeptime > 0: 354 | tdict['sleep'] = (sleeptime * total_time) / (runtime + sleeptime) 355 | threads_dict["tasks"][v.pid] = tdict 356 | 357 | # we need to load the wake deps into our dicts. This isn't super awesome 358 | # because rt-app only does pthreads, so we'll lose any process->process wakeups, 359 | # but those shouldn't matter too much. We also have to search all the task 360 | # lists, because I'm shit at python and don't know a better way to do this 361 | for name,actions in threads_dict['tasks'].items(): 362 | if actions['instance'] > 1: 363 | actions['run'] /= actions['instance'] 364 | threads_list.append(threads_dict) 365 | 366 | 367 | # for task in threads_list: 368 | # if waker in task["tasks"] and sleeper in task["tasks"]: 369 | # task["tasks"][waker].append(('resume', sleeper)) 370 | # if sleeper not in suspends: 371 | # task["tasks"][sleeper].append(('suspend', sleeper)) 372 | # suspends.append(sleeper) 373 | # break 374 | 375 | # Now we have to sort our output. rt-app expects the thread instructions to be 376 | # in the order that they are executed. We don't have to worry about sorting the 377 | # threads themselves, just the actions. I shamelessly stole this from SO. 378 | # 379 | # The ordering for the waker should be 380 | # 381 | # loop->run->resume->sleep 382 | # 383 | # This is to simulate the producer getting a request, waking up the worker, and 384 | # going to sleep until the next thing shows up. The ordering for the worker 385 | # should be 386 | # 387 | # loop->suspend->run->sleep 388 | # 389 | # This simulates the thread waiting to be given work, waking up and then going 390 | # back to sleep to wait for the next work set. 
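#
# For example, a producer/consumer pair in rt-app JSON terms, written by hand
# to mirror the shape of the generated entries in new-unbalanced.json (the
# "mutexX"/"sharedX" resource names and the run value here are just
# placeholders):
#
#   "waker":  { "instance": 1, "loop": -1, "lock": "mutexX",
#               "broad": "sharedX", "unlock": "mutexX", "sleep": 0 },
#   "worker": { "instance": 2, "loop": -1, "lock": "mutexX",
#               "wait": { "ref": "sharedX", "mutex": "mutexX" },
#               "unlock": "mutexX", "run": 500 }
#
# rt-app executes the events of each task object in the order they appear,
# which is why the sorting pass below matters.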
391 | for task in threads_list: 392 | sort_order = ['instance', 'loop', 'priority', 'lock', 'wait', 'broad', 'unlock', 'run', 393 | 'sleep'] 394 | for name,actions in task['tasks'].items(): 395 | task['tasks'][name] = OrderedDict(sorted(actions.items(), 396 | key=lambda kv: sort_order.index(kv[0]))) 397 | print(json.dumps(task, indent=4)) 398 | -------------------------------------------------------------------------------- /snapshot-balance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | COMPILEBENCH=/root/compilebench-0.6/ 4 | DEV=/dev/nvme1n1 5 | MNT=/mnt/test 6 | SNAP_INTERVAL=1 7 | NUM_SNAPS=10 8 | 9 | _fail() { 10 | echo $1 11 | exit 1 12 | } 13 | 14 | _snap_thread() { 15 | local i=0 16 | local del=0 17 | local DEL_MOD=$(( NUM_SNAPS * 2 )) 18 | local DEL_SNAPS=$NUM_SNAPS 19 | while [ 1 ] 20 | do 21 | sleep $SNAP_INTERVAL 22 | btrfs sub snap $MNT $MNT/snaps/snap$i > /dev/null || \ 23 | _fail "failed to create snap$i" 24 | i=$(( i + 1 )) 25 | if [ "$(( i % DEL_MOD))" -eq "0" ] 26 | then 27 | for c in $(seq 1 $DEL_SNAPS) 28 | do 29 | btrfs subvolume delete $MNT/snaps/snap$del || \ 30 | _fail "failed to delete snap$del" 31 | del=$((del + 1 )) 32 | done 33 | btrfs balance start --full-balance --bg $MNT 34 | DEL_SNAPS=20 35 | fi 36 | done 37 | } 38 | 39 | _balance_thread() { 40 | while [ 1 ] 41 | do 42 | sleep $SNAP_INTERVAL 43 | btrfs balance start --full-balance $MNT || \ 44 | _fail "failed to balance" 45 | done 46 | } 47 | 48 | mkfs.btrfs -f -n 4096 $DEV || _fail "couldn't mkfs" 49 | mount $DEV $MNT || _fail "couldn't mount" 50 | 51 | mkdir $MNT/snaps 52 | _snap_thread & 53 | SNAP_PID=$! 54 | 55 | cd $COMPILEBENCH 56 | for i in $(seq 0 100) 57 | do 58 | ./compilebench -i 300 -m -D $MNT || break 59 | done 60 | 61 | [ "$?" 
-ne "0"] && echo "compilebench failed" 62 | 63 | btrfs balance cancel $MNT 64 | kill -9 $SNAP_PID 65 | 66 | wait 67 | -------------------------------------------------------------------------------- /socket-debug.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | import ctypes as ct 3 | 4 | b = BPF(text=""" 5 | #include 6 | #include 7 | #include 8 | 9 | BPF_HASH(holders, struct sock *); 10 | BPF_PERF_OUTPUT(events); 11 | 12 | int trace_sock_sendmsg(struct pt_regs *ctx, struct socket *socket) 13 | { 14 | struct sock *sk = socket->sk; 15 | struct inet_sock *inet = inet_sk(sk); 16 | u16 port = sk->__sk_common.skc_num; 17 | //port = ntohs(port); 18 | if (port == 0xcea) { 19 | u64 val = port; 20 | events.perf_submit(ctx, &val, sizeof(val)); 21 | } 22 | return 0; 23 | } 24 | """) 25 | b.attach_kprobe(event="sock_sendmsg", fn_name="trace_sock_sendmsg") 26 | 27 | def print_pid(cpu, data, size): 28 | event = ct.cast(data, ct.POINTER(ct.c_ulonglong)).contents 29 | print("pid %d is responsible" % (event.value)) 30 | 31 | b["events"].open_perf_buffer(print_pid) 32 | while 1: 33 | b.kprobe_poll() 34 | -------------------------------------------------------------------------------- /test-mmap-sync.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | static int file_fd; 13 | static uint64_t filesize = 1024 * 1024 * 1024; 14 | static uint64_t bs = 128 * 1024 * 1024; 15 | static int nr_threads = 3; 16 | static int loops = 100000; 17 | 18 | #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) 19 | #define MAX(x, a) ((x) < (a) ? a : x) 20 | #define MIN(x, a) ((x) < (a) ? 
x : a) 21 | 22 | static void get_offset_size(uint64_t *offset, uint64_t *size) 23 | { 24 | *size = (uint64_t)random() % filesize; 25 | *offset = (uint64_t)random() % filesize; 26 | *offset = MIN(ALIGN(*offset, bs), filesize - bs); 27 | *size = MAX(ALIGN(*size, bs), bs); 28 | *size = MIN(*size, filesize - *offset); 29 | } 30 | 31 | static void *sync_file(void *arg) 32 | { 33 | int i, ret; 34 | 35 | for (i = 0; i < loops; i++) { 36 | ret = sync_file_range(file_fd, 0, filesize, 37 | SYNC_FILE_RANGE_WRITE); 38 | if (ret) { 39 | perror("Couldn't sync"); 40 | break; 41 | } 42 | sleep(2); 43 | } 44 | return NULL; 45 | } 46 | 47 | static void *mwrite_file(void *arg) 48 | { 49 | char fill = random(); 50 | char *ptr = mmap(NULL, filesize, PROT_WRITE, MAP_SHARED, file_fd, 0); 51 | uint64_t offset = 0; 52 | int i; 53 | 54 | if (ptr == MAP_FAILED) { 55 | perror("Mmap failed"); 56 | return NULL; 57 | } 58 | 59 | for (i = 0; i < loops; i++) { 60 | for (offset = bs; offset < filesize; offset += bs) { 61 | uint64_t off = offset - (1024 * 1024); 62 | uint64_t size = 2 * 1024 * 1024; 63 | memset(ptr + off, fill, size); 64 | } 65 | } 66 | return NULL; 67 | } 68 | 69 | static void *write_file(void *arg) 70 | { 71 | char fill = random(); 72 | char *buf; 73 | ssize_t ret; 74 | uint64_t offset; 75 | int i; 76 | 77 | buf = malloc(bs); 78 | if (!buf) { 79 | perror("Couldn't allocate temporary buffer"); 80 | return NULL; 81 | } 82 | 83 | memset(buf, fill, bs); 84 | for (i = 0; i < loops; i++) { 85 | for (offset = 0; offset < filesize; offset += bs) { 86 | ret = pwrite(file_fd, buf, bs, offset); 87 | if (ret < 0) { 88 | perror("Failed to write fd"); 89 | goto out; 90 | } 91 | } 92 | } 93 | out: 94 | free(buf); 95 | return NULL; 96 | } 97 | 98 | int main(int argc, char **argv) 99 | { 100 | pthread_t *threads; 101 | int i, ret; 102 | 103 | file_fd = open("testfile", O_CREAT|O_RDWR|O_TRUNC, 0644); 104 | if (file_fd < 0) { 105 | perror("Failed to open file_fd"); 106 | return -1; 107 | } 108 | 109 | if (ftruncate(file_fd, filesize)) { 110 | perror("Ftruncate failed"); 111 | return -1; 112 | } 113 | 114 | threads = malloc(sizeof(pthread_t) * nr_threads); 115 | if (!threads) { 116 | perror("Couldn't allocate threads array"); 117 | return -1; 118 | } 119 | memset(threads, 0, sizeof(pthread_t) * nr_threads); 120 | 121 | for (i = 0; i < nr_threads - 1; i++) { 122 | if (i % 2) 123 | ret = pthread_create(&threads[i], NULL, write_file, NULL); 124 | else 125 | ret = pthread_create(&threads[i], NULL, mwrite_file, NULL); 126 | if (ret) { 127 | perror("Failed to create thread"); 128 | goto out; 129 | } 130 | } 131 | ret = pthread_create(&threads[nr_threads - 1], NULL, sync_file, NULL); 132 | if (ret) { 133 | perror("Failed to create sync thread"); 134 | goto out; 135 | } 136 | out: 137 | for (i = 0; i < nr_threads; i++) { 138 | ret = pthread_join(threads[i], NULL); 139 | if (ret) { 140 | perror("Couldn't pthread_join"); 141 | return -1; 142 | } 143 | } 144 | return 0; 145 | } 146 | -------------------------------------------------------------------------------- /test-parse.py: -------------------------------------------------------------------------------- 1 | from kernelparse import FileParser 2 | 3 | f = open("test.c") 4 | p = FileParser() 5 | cg = p.parse_file(f) 6 | 7 | funcs = ['box', 'bean', 'boo', 'baz', 'foo', 'main'] 8 | if set(funcs) != set(cg.functions.keys()): 9 | print("FAILED: didn't find all the functions {}".format(cg.functions)) 10 | exit(1) 11 | print("SUCCESS!") 12 | 
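# (Debugging note, not part of the original test: cg.functions is the only
# FileParser output this check relies on, and on a failure printing
# set(funcs) ^ set(cg.functions.keys()) shows exactly which function names
# were missed or unexpected.)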
-------------------------------------------------------------------------------- /timing-everything.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | import ctypes as ct 3 | import argparse 4 | import subprocess 5 | 6 | bpf_text = """ 7 | #include 8 | """ 9 | 10 | sections = {} 11 | sections['struct'] = "typedef struct data_s {\nu64 pid;\n" 12 | sections['hashes'] = "BPF_PERF_OUTPUT(events);\n" 13 | sections['funcs'] = "" 14 | sections['end_func'] = "" 15 | sections['start_func'] = "" 16 | to_attach = [] 17 | 18 | def sanitize_name(func): 19 | # Some function names get optimized to include .isra in their name, which 20 | # makes everything puke, so sanitize these names into something different. 21 | if '.' in func: 22 | return func.split('.')[0] 23 | return func 24 | 25 | def add_main_func(func_name, thresh, sections, attach): 26 | name = sanitize_name(func_name) 27 | sections['struct'] += "u64 {}_duration;\n".format(name) 28 | sections['hashes'] += "BPF_HASH({}_time, u64, u64);\n".format(name) 29 | sections['start_func'] = """ 30 | int trace_start_NAME(struct pt_regs *ctx) { 31 | u64 pid = bpf_get_current_pid_tgid(); 32 | u64 ts = bpf_ktime_get_ns(); 33 | u64 zero = 0; 34 | 35 | NAME_time.update(&pid, &ts); 36 | """.replace('NAME', name) 37 | 38 | sections['end_func'] = """ 39 | int trace_stop_NAME(struct pt_regs *ctx) { 40 | u64 pid = bpf_get_current_pid_tgid(); 41 | u64 delta; 42 | u64 *val; 43 | 44 | val = NAME_time.lookup(&pid); 45 | if (!val) 46 | return 0; 47 | 48 | delta = bpf_ktime_get_ns() - *val; 49 | if (delta < THRESHOLDL) 50 | return 0; 51 | 52 | data_t d = { 53 | .pid = (u32)pid, 54 | .NAME_duration = delta, 55 | }; 56 | """.replace('NAME', name).replace('THRESHOLD', str(thresh)) 57 | attach.append((name, func_name)) 58 | 59 | return "{}_duration".format(name) 60 | 61 | def add_function(func_name, sections, attach, siblings): 62 | name = sanitize_name(func_name) 63 | siblings.append("{}_duration".format(name)) 64 | sections['struct'] += "u64 {}_duration;\n".format(name) 65 | sections['hashes'] += "BPF_HASH({}_start, u64, u64);\n".format(name) 66 | sections['hashes'] += "BPF_HASH({}_time, u64, u64);\n".format(name) 67 | sections['funcs'] += """ 68 | int trace_start_NAME(struct pt_regs *ctx) { 69 | u64 pid = bpf_get_current_pid_tgid(); 70 | u64 ts = bpf_ktime_get_ns(); 71 | 72 | NAME_start.update(&pid, &ts); 73 | return 0; 74 | } 75 | 76 | int trace_stop_NAME(struct pt_regs *ctx) { 77 | u64 pid = bpf_get_current_pid_tgid(); 78 | u64 delta, zero = 0; 79 | u64 *val; 80 | 81 | val = NAME_start.lookup(&pid); 82 | if (!val) 83 | return 0; 84 | 85 | delta = bpf_ktime_get_ns() - *val; 86 | val = NAME_time.lookup_or_init(&pid, &zero); 87 | lock_xadd(val, delta); 88 | return 0; 89 | } 90 | """.replace('NAME', name) 91 | 92 | sections['end_func'] += """ 93 | val = NAME_time.lookup(&pid); 94 | if (val) 95 | d.NAME_duration = *val; 96 | """.replace('NAME', name) 97 | 98 | sections['start_func'] += """ 99 | NAME_time.update(&pid, &zero); 100 | """.replace('NAME', name) 101 | 102 | attach.append((name, func_name)) 103 | 104 | siblings = [] 105 | 106 | parser = argparse.ArgumentParser(description="Trace some bullshit") 107 | parser.add_argument('--children', type=str, nargs='+', default=[], 108 | help="Any children you want to trace under the main function") 109 | parser.add_argument('--main', type=str, required=True, 110 | help="The main function to trace") 111 | parser.add_argument('--threshold', type=int, default=1000000, 112 
| help="Only worry about events that take X ns, defaults to 1ms") 113 | args = parser.parse_args() 114 | 115 | main = add_main_func(args.main, args.threshold, sections, to_attach) 116 | for c in args.children: 117 | add_function(c, sections, to_attach, siblings) 118 | 119 | sections['end_func'] += """ 120 | events.perf_submit(ctx, &d, sizeof(d)); 121 | return 0; 122 | } 123 | """ 124 | 125 | sections['start_func'] += """ 126 | return 0; 127 | } 128 | """ 129 | 130 | sections['struct'] += "} data_t;\n" 131 | 132 | bpf_text += sections['struct'] + sections['hashes'] + sections['funcs'] 133 | bpf_text += sections['start_func'] + sections['end_func'] 134 | print(bpf_text) 135 | b = BPF(text=bpf_text) 136 | 137 | for n,f in to_attach: 138 | b.attach_kretprobe(event=f, fn_name="trace_stop_{}".format(n)) 139 | b.attach_kprobe(event=f, fn_name="trace_start_{}".format(n)) 140 | 141 | def print_val(event, name): 142 | val = getattr(event, name) 143 | main_val = getattr(event, main) 144 | print("\t{} ns {}% {}".format(val, float(val / main_val * 100), name)) 145 | 146 | def print_data(cpu, data, size): 147 | event = b['events'].event(data) 148 | print("{} took {} ns".format(event.pid, getattr(event, main))) 149 | for n in siblings: 150 | print_val(event, n) 151 | subprocess.run(['kernelctl', 'walker']) 152 | 153 | b["events"].open_perf_buffer(print_data) 154 | 155 | print("tracing...") 156 | while True: 157 | try: 158 | print("probing") 159 | b.kprobe_poll() 160 | except KeyboardInterrupt: 161 | exit() 162 | -------------------------------------------------------------------------------- /timing.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | import ctypes as ct 3 | 4 | bpf_text = """ 5 | #include 6 | 7 | typedef struct data_s { 8 | u64 pid; 9 | u64 read_duration; 10 | u64 get_extent_duration; 11 | u64 lock_and_flush_duration; 12 | u64 submit_bio_duration; 13 | u64 csum_duration; 14 | u64 csum_count; 15 | u64 read_eb_duration; 16 | u64 read_eb_count; 17 | u64 io_schedule_duration; 18 | u64 bio_duration; 19 | u64 req_duration; 20 | } data_t; 21 | 22 | typedef struct bio_data_s { 23 | u64 ts; 24 | u64 pid; 25 | } bio_data_t; 26 | 27 | BPF_HASH(read_time, u64, u64); 28 | BPF_HASH(get_extent_time, u64, u64); 29 | BPF_HASH(get_extent_start, u64, u64); 30 | BPF_HASH(lock_and_flush_start, u64, u64); 31 | BPF_HASH(lock_and_flush_time, u64, u64); 32 | BPF_HASH(submit_bio_start, u64, u64); 33 | BPF_HASH(submit_bio_time, u64, u64); 34 | BPF_HASH(csum_start, u64, u64); 35 | BPF_HASH(csum_time, u64, u64); 36 | BPF_HASH(csum_count, u64, u64); 37 | BPF_HASH(read_eb_start, u64, u64); 38 | BPF_HASH(read_eb_time, u64, u64); 39 | BPF_HASH(read_eb_count, u64, u64); 40 | BPF_HASH(io_sched_start, u64, u64); 41 | BPF_HASH(io_sched_time, u64, u64); 42 | BPF_HASH(bio_start, struct bio *, bio_data_t); 43 | BPF_HASH(bio_time, u64, u64); 44 | BPF_HASH(req_tmp, u64, struct bio *); 45 | BPF_HASH(req_start, struct request *, bio_data_t); 46 | BPF_HASH(req_time, u64, u64); 47 | BPF_PERF_OUTPUT(events); 48 | 49 | int trace_blk_mq_get_request_ret(struct pt_regs *ctx) 50 | { 51 | u64 pid = bpf_get_current_pid_tgid(); 52 | u64 *val; 53 | 54 | val = read_eb_start.lookup(&pid); 55 | if (!val) 56 | return 0; 57 | bio_data_t d = { 58 | .pid = pid, 59 | .ts = bpf_ktime_get_ns(), 60 | }; 61 | 62 | struct request *rq = (struct request *)PT_REGS_RC(ctx); 63 | req_start.update(&rq, &d); 64 | return 0; 65 | } 66 | 67 | int trace_blk_mq_end_request(struct pt_regs *ctx, struct 
request *rq) 68 | { 69 | bio_data_t *d = req_start.lookup(&rq); 70 | if (!d) 71 | return 0; 72 | u64 pid = d->pid; 73 | u64 zero = 0; 74 | u64 *val = req_time.lookup_or_init(&pid, &zero); 75 | lock_xadd(val, bpf_ktime_get_ns() - d->ts); 76 | return 0; 77 | } 78 | 79 | int trace_submit_one_bio(struct pt_regs *ctx, struct bio *bio) 80 | { 81 | u64 pid = bpf_get_current_pid_tgid(); 82 | u64 *val; 83 | 84 | val = read_eb_start.lookup(&pid); 85 | if (!val) 86 | return 0; 87 | 88 | bio_data_t d = { 89 | .ts = bpf_ktime_get_ns(), 90 | .pid = pid, 91 | }; 92 | bio_start.update(&bio, &d); 93 | return 0; 94 | } 95 | 96 | int trace_end_bio_extent_readpage(struct pt_regs *ctx, struct bio *bio) 97 | { 98 | bio_data_t *d = bio_start.lookup(&bio); 99 | if (!d) 100 | return 0; 101 | u64 zero = 0; 102 | u64 pid = d->pid; 103 | u64 *val = bio_time.lookup_or_init(&pid, &zero); 104 | lock_xadd(val, bpf_ktime_get_ns() - d->ts); 105 | return 0; 106 | } 107 | 108 | int trace_start_io_sched(struct pt_regs *ctx) { 109 | u64 pid = bpf_get_current_pid_tgid(); 110 | u64 ts = bpf_ktime_get_ns(); 111 | 112 | io_sched_start.update(&pid, &ts); 113 | return 0; 114 | } 115 | 116 | int trace_stop_io_sched(struct pt_regs *ctx) { 117 | u64 pid = bpf_get_current_pid_tgid(); 118 | u64 delta, zero = 0; 119 | u64 *val; 120 | 121 | val = io_sched_start.lookup(&pid); 122 | if (!val) 123 | return 0; 124 | 125 | delta = bpf_ktime_get_ns() - *val; 126 | val = io_sched_time.lookup_or_init(&pid, &zero); 127 | lock_xadd(val, delta); 128 | 129 | return 0; 130 | } 131 | int trace_start_read_eb(struct pt_regs *ctx) { 132 | u64 pid = bpf_get_current_pid_tgid(); 133 | u64 ts = bpf_ktime_get_ns(); 134 | 135 | read_eb_start.update(&pid, &ts); 136 | return 0; 137 | } 138 | 139 | int trace_stop_read_eb(struct pt_regs *ctx) { 140 | u64 pid = bpf_get_current_pid_tgid(); 141 | u64 delta, zero = 0; 142 | u64 *val; 143 | 144 | val = read_eb_start.lookup(&pid); 145 | if (!val) 146 | return 0; 147 | 148 | delta = bpf_ktime_get_ns() - *val; 149 | val = read_eb_time.lookup_or_init(&pid, &zero); 150 | lock_xadd(val, delta); 151 | 152 | val = read_eb_count.lookup_or_init(&pid, &zero); 153 | lock_xadd(val, 1); 154 | return 0; 155 | } 156 | 157 | int trace_start_csum(struct pt_regs *ctx) { 158 | u64 pid = bpf_get_current_pid_tgid(); 159 | u64 ts = bpf_ktime_get_ns(); 160 | 161 | csum_start.update(&pid, &ts); 162 | return 0; 163 | } 164 | 165 | int trace_stop_csum(struct pt_regs *ctx) { 166 | u64 pid = bpf_get_current_pid_tgid(); 167 | u64 delta, zero = 0; 168 | u64 *val; 169 | 170 | val = csum_start.lookup(&pid); 171 | if (!val) 172 | return 0; 173 | 174 | delta = bpf_ktime_get_ns() - *val; 175 | val = csum_time.lookup_or_init(&pid, &zero); 176 | lock_xadd(val, delta); 177 | 178 | val = csum_count.lookup_or_init(&pid, &zero); 179 | lock_xadd(val, 1); 180 | return 0; 181 | } 182 | 183 | int trace_start_submit_bio(struct pt_regs *ctx) { 184 | u64 pid = bpf_get_current_pid_tgid(); 185 | u64 ts = bpf_ktime_get_ns(); 186 | 187 | submit_bio_start.update(&pid, &ts); 188 | return 0; 189 | } 190 | 191 | int trace_stop_submit_bio(struct pt_regs *ctx) { 192 | u64 pid = bpf_get_current_pid_tgid(); 193 | u64 delta, zero = 0; 194 | u64 *val; 195 | 196 | val = submit_bio_start.lookup(&pid); 197 | if (!val) 198 | return 0; 199 | 200 | delta = bpf_ktime_get_ns() - *val; 201 | val = submit_bio_time.lookup_or_init(&pid, &zero); 202 | lock_xadd(val, delta); 203 | return 0; 204 | } 205 | 206 | int trace_start_lock_and_flush(struct pt_regs *ctx) { 207 | u64 pid = 
bpf_get_current_pid_tgid(); 208 | u64 ts = bpf_ktime_get_ns(); 209 | 210 | lock_and_flush_start.update(&pid, &ts); 211 | return 0; 212 | } 213 | 214 | int trace_stop_lock_and_flush(struct pt_regs *ctx) { 215 | u64 pid = bpf_get_current_pid_tgid(); 216 | u64 delta, zero = 0; 217 | u64 *val; 218 | 219 | val = lock_and_flush_start.lookup(&pid); 220 | if (!val) 221 | return 0; 222 | 223 | delta = bpf_ktime_get_ns() - *val; 224 | val = lock_and_flush_time.lookup_or_init(&pid, &zero); 225 | lock_xadd(val, delta); 226 | return 0; 227 | } 228 | 229 | int trace_start_get_extent(struct pt_regs *ctx) { 230 | u64 pid = bpf_get_current_pid_tgid(); 231 | u64 ts = bpf_ktime_get_ns(); 232 | 233 | get_extent_start.update(&pid, &ts); 234 | return 0; 235 | } 236 | 237 | int trace_stop_get_extent(struct pt_regs *ctx) { 238 | u64 pid = bpf_get_current_pid_tgid(); 239 | u64 delta, zero = 0; 240 | u64 *val; 241 | 242 | val = get_extent_start.lookup(&pid); 243 | if (!val) 244 | return 0; 245 | 246 | delta = bpf_ktime_get_ns() - *val; 247 | val = get_extent_time.lookup_or_init(&pid, &zero); 248 | lock_xadd(val, delta); 249 | return 0; 250 | } 251 | 252 | int trace_start_read(struct pt_regs *ctx) { 253 | u64 pid = bpf_get_current_pid_tgid(); 254 | u64 ts = bpf_ktime_get_ns(); 255 | u64 zero = 0; 256 | u64 tgid = pid >> 32; 257 | 258 | if (tgid != 665225) 259 | return 0; 260 | 261 | read_time.update(&pid, &ts); 262 | get_extent_time.update(&pid, &zero); 263 | lock_and_flush_time.update(&pid, &zero); 264 | submit_bio_time.update(&pid, &zero); 265 | csum_time.update(&pid, &zero); 266 | csum_count.update(&pid, &zero); 267 | read_eb_time.update(&pid, &zero); 268 | read_eb_count.update(&pid, &zero); 269 | io_sched_time.update(&pid, &zero); 270 | bio_time.update(&pid, &zero); 271 | req_time.update(&pid, &zero); 272 | return 0; 273 | } 274 | 275 | int trace_stop_read(struct pt_regs *ctx) { 276 | u64 pid = bpf_get_current_pid_tgid(); 277 | u64 delta; 278 | u64 *val; 279 | 280 | val = read_time.lookup(&pid); 281 | if (!val) 282 | return 0; 283 | 284 | delta = bpf_ktime_get_ns() - *val; 285 | if (delta < 1000000L) { 286 | bpf_trace_printk("latency was %llu\\n", delta); 287 | return 0; 288 | } 289 | 290 | data_t d = { 291 | .pid = pid, 292 | .read_duration = delta, 293 | }; 294 | 295 | val = get_extent_time.lookup(&pid); 296 | if (val) 297 | d.get_extent_duration = *val; 298 | 299 | val = lock_and_flush_time.lookup(&pid); 300 | if (val) 301 | d.lock_and_flush_duration = *val; 302 | 303 | val = submit_bio_time.lookup(&pid); 304 | if (val) 305 | d.submit_bio_duration = *val; 306 | 307 | val = csum_time.lookup(&pid); 308 | if (val) 309 | d.csum_duration = *val; 310 | 311 | val = csum_count.lookup(&pid); 312 | if (val) 313 | d.csum_count = *val; 314 | 315 | val = read_eb_time.lookup(&pid); 316 | if (val) 317 | d.read_eb_duration = *val; 318 | 319 | val = read_eb_count.lookup(&pid); 320 | if (val) 321 | d.read_eb_count = *val; 322 | 323 | val = io_sched_time.lookup(&pid); 324 | if (val) 325 | d.io_schedule_duration = *val; 326 | 327 | val = bio_time.lookup(&pid); 328 | if (val) 329 | d.bio_duration = *val; 330 | 331 | val = req_time.lookup(&pid); 332 | if (val) 333 | d.req_duration = *val; 334 | 335 | events.perf_submit(ctx, &d, sizeof(d)); 336 | return 0; 337 | } 338 | """ 339 | 340 | b = BPF(text=bpf_text) 341 | b.attach_kprobe(event="extent_readpages", fn_name="trace_start_read") 342 | b.attach_kretprobe(event="extent_readpages", fn_name="trace_stop_read") 343 | b.attach_kprobe(event="btrfs_get_extent", 
fn_name="trace_start_get_extent") 344 | b.attach_kretprobe(event="btrfs_get_extent", fn_name="trace_stop_get_extent") 345 | b.attach_kprobe(event="btrfs_lock_and_flush_ordered_range", fn_name="trace_start_lock_and_flush") 346 | b.attach_kretprobe(event="btrfs_lock_and_flush_ordered_range", fn_name="trace_stop_lock_and_flush") 347 | b.attach_kprobe(event="submit_one_bio", fn_name="trace_start_submit_bio") 348 | b.attach_kretprobe(event="submit_one_bio", fn_name="trace_stop_submit_bio") 349 | b.attach_kprobe(event="btrfs_lookup_csum", fn_name="trace_start_csum") 350 | b.attach_kretprobe(event="btrfs_lookup_csum", fn_name="trace_stop_csum") 351 | b.attach_kprobe(event="read_extent_buffer_pages", fn_name="trace_start_read_eb") 352 | b.attach_kretprobe(event="read_extent_buffer_pages", fn_name="trace_stop_read_eb") 353 | b.attach_kprobe(event="io_schedule", fn_name="trace_start_io_sched") 354 | b.attach_kretprobe(event="io_schedule", fn_name="trace_stop_io_sched") 355 | b.attach_kprobe(event="submit_bio", fn_name="trace_submit_one_bio") 356 | b.attach_kprobe(event="end_workqueue_bio", fn_name="trace_end_bio_extent_readpage") 357 | b.attach_kretprobe(event="blk_mq_get_request", fn_name="trace_blk_mq_get_request_ret") 358 | b.attach_kprobe(event="blk_mq_end_request", fn_name="trace_blk_mq_end_request") 359 | 360 | def print_data(cpu, data, size): 361 | event = b['events'].event(data) 362 | print("{} took {} ns to read".format(event.pid, event.read_duration)) 363 | print("\t{} ns {}% get extent".format(event.get_extent_duration, 364 | float(event.get_extent_duration / event.read_duration * 100))) 365 | print("\t{} ns {}% lock and flush".format(event.lock_and_flush_duration, 366 | float(event.lock_and_flush_duration / event.read_duration * 100))) 367 | print("\t{} ns {}% submit_bio".format(event.submit_bio_duration, 368 | float(event.submit_bio_duration / event.read_duration * 100))) 369 | print("\t{} ns {}% io_schedule".format(event.io_schedule_duration, 370 | float(event.io_schedule_duration / event.read_duration * 100))) 371 | print("\t{} ns {}% csum count {}".format(event.csum_duration, 372 | float(event.csum_duration / event.read_duration * 100), 373 | event.csum_count)) 374 | print("\t{} ns {}% read_eb count {}".format(event.read_eb_duration, 375 | float(event.read_eb_duration / event.read_duration * 100), 376 | event.read_eb_count)) 377 | print("\t{} ns {}% bio io time".format(event.bio_duration, 378 | float(event.bio_duration / event.read_duration * 100))) 379 | print("\t{} ns {}% req io time".format(event.req_duration, 380 | float(event.req_duration / event.read_duration * 100))) 381 | 382 | b["events"].open_perf_buffer(print_data) 383 | 384 | print("tracing...") 385 | while True: 386 | try: 387 | print("probing") 388 | b.kprobe_poll() 389 | except KeyboardInterrupt: 390 | exit() 391 | -------------------------------------------------------------------------------- /unbalanced-reproducer/new-unbalanced.json: -------------------------------------------------------------------------------- 1 | { 2 | "global": { 3 | "duration": 60 4 | }, 5 | "tasks": { 6 | "3980936": { 7 | "instance": 1, 8 | "loop": -1, 9 | "lock": "mutex3980936", 10 | "broad": "shared3980936", 11 | "unlock": "mutex3980936", 12 | "sleep": 0 13 | }, 14 | "3980916": { 15 | "instance": 1, 16 | "loop": -1, 17 | "lock": "mutex3980916", 18 | "broad": "shared3980916", 19 | "unlock": "mutex3980916", 20 | "sleep": 0 21 | }, 22 | "3980867": { 23 | "instance": 2, 24 | "loop": -1, 25 | "priority": -7, 26 | "lock": "mutex3980936", 27 | 
"wait": { 28 | "mutex": "mutex3980936", 29 | "ref": "shared3980936" 30 | }, 31 | "broad": "shared3980867", 32 | "unlock": "mutex3980936", 33 | "run": 35, 34 | "sleep": 0 35 | }, 36 | "3979037": { 37 | "instance": 1, 38 | "loop": -1, 39 | "run": 1000000 40 | }, 41 | "3980958": { 42 | "instance": 1, 43 | "loop": -1, 44 | "lock": "mutex3980958", 45 | "broad": "shared3980958", 46 | "unlock": "mutex3980958", 47 | "sleep": 0 48 | }, 49 | "3979077": { 50 | "instance": 1, 51 | "loop": -1, 52 | "run": 53, 53 | "sleep": 999946 54 | }, 55 | "3979041": { 56 | "instance": 1, 57 | "loop": -1, 58 | "run": 20, 59 | "sleep": 999979 60 | }, 61 | "3979043": { 62 | "instance": 1, 63 | "loop": -1, 64 | "run": 6672, 65 | "sleep": 993327 66 | }, 67 | "3979044": { 68 | "instance": 1, 69 | "loop": -1, 70 | "run": 2, 71 | "sleep": 999997 72 | }, 73 | "3979943": { 74 | "instance": 1, 75 | "loop": -1, 76 | "run": 7693, 77 | "sleep": 992306 78 | }, 79 | "3979944": { 80 | "instance": 1, 81 | "loop": -1, 82 | "run": 44, 83 | "sleep": 999955 84 | }, 85 | "3979945": { 86 | "instance": 1, 87 | "loop": -1, 88 | "run": 119, 89 | "sleep": 999880 90 | }, 91 | "3980970": { 92 | "instance": 1, 93 | "loop": -1, 94 | "run": 69, 95 | "sleep": 999930 96 | }, 97 | "3979079": { 98 | "instance": 1, 99 | "loop": -1, 100 | "run": 19, 101 | "sleep": 999980 102 | }, 103 | "threads3980936": { 104 | "instance": 41, 105 | "loop": -1, 106 | "lock": "mutex3980936", 107 | "wait": { 108 | "mutex": "mutex3980936", 109 | "ref": "shared3980936" 110 | }, 111 | "unlock": "mutex3980936", 112 | "run": 699 113 | }, 114 | "3980975": { 115 | "instance": 1, 116 | "loop": -1, 117 | "run": 60, 118 | "sleep": 999939 119 | }, 120 | "3980976": { 121 | "instance": 1, 122 | "loop": -1, 123 | "run": 127, 124 | "sleep": 999872 125 | }, 126 | "threads3980915": { 127 | "instance": 37, 128 | "loop": -1, 129 | "lock": "mutex3980915", 130 | "wait": { 131 | "mutex": "mutex3980915", 132 | "ref": "shared3980915" 133 | }, 134 | "unlock": "mutex3980915", 135 | "run": 328 136 | }, 137 | "threads3980958": { 138 | "instance": 15, 139 | "loop": -1, 140 | "lock": "mutex3980958", 141 | "wait": { 142 | "mutex": "mutex3980958", 143 | "ref": "shared3980958" 144 | }, 145 | "unlock": "mutex3980958", 146 | "run": 1079 147 | }, 148 | "threads3980916": { 149 | "instance": 2, 150 | "loop": -1, 151 | "lock": "mutex3980916", 152 | "wait": { 153 | "mutex": "mutex3980916", 154 | "ref": "shared3980916" 155 | }, 156 | "unlock": "mutex3980916", 157 | "run": 73 158 | }, 159 | "threads3980877": { 160 | "instance": 2, 161 | "loop": -1, 162 | "lock": "mutex3980877", 163 | "wait": { 164 | "mutex": "mutex3980877", 165 | "ref": "shared3980877" 166 | }, 167 | "unlock": "mutex3980877", 168 | "run": 345 169 | }, 170 | "threads3980872": { 171 | "instance": 6, 172 | "loop": -1, 173 | "lock": "mutex3980872", 174 | "wait": { 175 | "mutex": "mutex3980872", 176 | "ref": "shared3980872" 177 | }, 178 | "broad": "shared3980871", 179 | "unlock": "mutex3980872", 180 | "run": 370, 181 | "sleep": 0 182 | }, 183 | "threads3980873": { 184 | "instance": 11, 185 | "loop": -1, 186 | "lock": "mutex3980873", 187 | "wait": { 188 | "mutex": "mutex3980873", 189 | "ref": "shared3980873" 190 | }, 191 | "unlock": "mutex3980873", 192 | "run": 4082 193 | }, 194 | "threads3980870": { 195 | "instance": 22, 196 | "loop": -1, 197 | "lock": "mutex3980870", 198 | "wait": { 199 | "mutex": "mutex3980870", 200 | "ref": "shared3980870" 201 | }, 202 | "broad": "shared3980869", 203 | "unlock": "mutex3980870", 204 | "run": 97, 205 | "sleep": 0 206 
| }, 207 | "threads3980871": { 208 | "instance": 12, 209 | "loop": -1, 210 | "lock": "mutex3980871", 211 | "wait": { 212 | "mutex": "mutex3980871", 213 | "ref": "shared3980871" 214 | }, 215 | "unlock": "mutex3980871", 216 | "run": 1953 217 | }, 218 | "3979038": { 219 | "instance": 1, 220 | "loop": -1, 221 | "run": 1000000 222 | }, 223 | "3979075": { 224 | "instance": 1, 225 | "loop": -1, 226 | "run": 23, 227 | "sleep": 999976 228 | }, 229 | "3980868": { 230 | "instance": 1, 231 | "loop": -1, 232 | "priority": -7, 233 | "lock": "mutex3980868", 234 | "broad": "shared3980868", 235 | "unlock": "mutex3980868", 236 | "sleep": 0 237 | }, 238 | "3980869": { 239 | "instance": 22, 240 | "loop": -1, 241 | "lock": "mutex3980870", 242 | "wait": { 243 | "mutex": "mutex3980870", 244 | "ref": "shared3980870" 245 | }, 246 | "broad": "shared3980869", 247 | "unlock": "mutex3980870", 248 | "run": 97, 249 | "sleep": 0 250 | }, 251 | "3980870": { 252 | "instance": 1, 253 | "loop": -1, 254 | "lock": "mutex3980870", 255 | "broad": "shared3980870", 256 | "unlock": "mutex3980870", 257 | "sleep": 0 258 | }, 259 | "3980871": { 260 | "instance": 6, 261 | "loop": -1, 262 | "lock": "mutex3980872", 263 | "wait": { 264 | "mutex": "mutex3980872", 265 | "ref": "shared3980872" 266 | }, 267 | "broad": "shared3980871", 268 | "unlock": "mutex3980872", 269 | "run": 370, 270 | "sleep": 0 271 | }, 272 | "3980872": { 273 | "instance": 1, 274 | "loop": -1, 275 | "lock": "mutex3980872", 276 | "broad": "shared3980872", 277 | "unlock": "mutex3980872", 278 | "sleep": 0 279 | }, 280 | "3980873": { 281 | "instance": 1, 282 | "loop": -1, 283 | "lock": "mutex3980873", 284 | "broad": "shared3980873", 285 | "unlock": "mutex3980873", 286 | "sleep": 0 287 | }, 288 | "3979980": { 289 | "instance": 1, 290 | "loop": -1, 291 | "lock": "mutex3979980", 292 | "broad": "shared3979980", 293 | "unlock": "mutex3979980", 294 | "sleep": 0 295 | }, 296 | "3980877": { 297 | "instance": 1, 298 | "loop": -1, 299 | "lock": "mutex3980877", 300 | "broad": "shared3980877", 301 | "unlock": "mutex3980877", 302 | "sleep": 0 303 | }, 304 | "3979087": { 305 | "instance": 1, 306 | "loop": -1, 307 | "run": 292, 308 | "sleep": 999707 309 | }, 310 | "3979984": { 311 | "instance": 1, 312 | "loop": -1, 313 | "run": 1000000 314 | }, 315 | "threads3979980": { 316 | "instance": 13, 317 | "loop": -1, 318 | "lock": "mutex3979980", 319 | "wait": { 320 | "mutex": "mutex3979980", 321 | "ref": "shared3979980" 322 | }, 323 | "unlock": "mutex3979980", 324 | "run": 2408 325 | }, 326 | "3979107": { 327 | "instance": 1, 328 | "loop": -1, 329 | "run": 431281, 330 | "sleep": 568718 331 | }, 332 | "3979110": { 333 | "instance": 1, 334 | "loop": -1, 335 | "run": 8, 336 | "sleep": 999991 337 | }, 338 | "3980915": { 339 | "instance": 1, 340 | "loop": -1, 341 | "lock": "mutex3980915", 342 | "broad": "shared3980915", 343 | "unlock": "mutex3980915", 344 | "sleep": 0 345 | }, 346 | "threads3980869": { 347 | "instance": 16, 348 | "loop": -1, 349 | "lock": "mutex3980869", 350 | "wait": { 351 | "mutex": "mutex3980869", 352 | "ref": "shared3980869" 353 | }, 354 | "unlock": "mutex3980869", 355 | "run": 2192 356 | }, 357 | "threads3980868": { 358 | "instance": 10, 359 | "loop": -1, 360 | "lock": "mutex3980868", 361 | "wait": { 362 | "mutex": "mutex3980868", 363 | "ref": "shared3980868" 364 | }, 365 | "unlock": "mutex3980868", 366 | "run": 870 367 | }, 368 | "3980969": { 369 | "instance": 1, 370 | "loop": -1, 371 | "run": 48, 372 | "sleep": 999951 373 | }, 374 | "threads3980867": { 375 | "instance": 13, 
376 | "loop": -1, 377 | "lock": "mutex3980867", 378 | "wait": { 379 | "mutex": "mutex3980867", 380 | "ref": "shared3980867" 381 | }, 382 | "unlock": "mutex3980867", 383 | "run": 614 384 | }, 385 | "3980795": { 386 | "instance": 1, 387 | "loop": -1, 388 | "run": 440, 389 | "sleep": 999559 390 | }, 391 | "3980798": { 392 | "instance": 1, 393 | "loop": -1, 394 | "run": 753, 395 | "sleep": 999246 396 | }, 397 | "threads3980936priority-7": { 398 | "instance": 2, 399 | "loop": -1, 400 | "priority": -7, 401 | "lock": "mutex3980936", 402 | "wait": { 403 | "mutex": "mutex3980936", 404 | "ref": "shared3980936" 405 | }, 406 | "broad": "shared3980867", 407 | "unlock": "mutex3980936", 408 | "run": 35, 409 | "sleep": 0 410 | } 411 | } 412 | } 413 | -------------------------------------------------------------------------------- /unbalanced-reproducer/unbalanced-v1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CGROUP_MAIN=/sys/fs/cgroup/cpuacct 4 | CGROUP_DIR=$CGROUP_MAIN/foo 5 | CGROUP_BASE=$CGROUP_DIR/interactive 6 | 7 | _isolate_run() 8 | { 9 | name=$1 10 | shift 11 | echo "running '$*'" 12 | echo $BASHPID > $name/cgroup.procs 13 | $* 14 | } 15 | 16 | if [ -d "$CGROUP_DIR" ] 17 | then 18 | rmdir $CGROUP_BASE/small 19 | rmdir $CGROUP_BASE/large 20 | rmdir -p $CGROUP_BASE 21 | fi 22 | 23 | mkdir -p $CGROUP_BASE/small 24 | mkdir $CGROUP_BASE/large 25 | echo 102400 > $CGROUP_BASE/small/cpu.shares 26 | echo 102400 > $CGROUP_BASE/large/cpu.shares 27 | 28 | _isolate_run $CGROUP_BASE/small rt-app new-unbalanced.json & 29 | RTPID=$! 30 | 31 | _isolate_run $CGROUP_BASE/large stress -c 48 & 32 | wait $RTPID 33 | pkill -9 stress 34 | wait 35 | echo "Small usage" 36 | cat $CGROUP_BASE/small/cpuacct.usage 37 | echo "Large usage" 38 | cat $CGROUP_BASE/large/cpuacct.usage 39 | -------------------------------------------------------------------------------- /unbalanced-reproducer/unbalanced.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CGROUP_MAIN=/sys/fs/cgroup 4 | CGROUP_DIR=$CGROUP_MAIN/foo 5 | CGROUP_BASE=$CGROUP_DIR/interactive 6 | 7 | _mkdir() { 8 | mkdir $1 9 | echo "+cpu" > $1/cgroup.subtree_control 10 | } 11 | 12 | _isolate_run() 13 | { 14 | name=$1 15 | shift 16 | echo "running '$*'" 17 | echo $BASHPID > $name/cgroup.procs 18 | $* 19 | } 20 | 21 | if [ -d "$CGROUP_DIR" ] 22 | then 23 | rmdir $CGROUP_BASE/small 24 | rmdir $CGROUP_BASE/large 25 | rmdir -p $CGROUP_BASE 26 | fi 27 | 28 | echo "+cpu" > $CGROUP_MAIN/cgroup.subtree_control 29 | _mkdir $CGROUP_DIR 30 | _mkdir $CGROUP_BASE 31 | mkdir $CGROUP_BASE/small 32 | mkdir $CGROUP_BASE/large 33 | echo 10000 > $CGROUP_BASE/small/cpu.weight 34 | echo 10000 > $CGROUP_BASE/large/cpu.weight 35 | 36 | _isolate_run $CGROUP_BASE/small rt-app new-unbalanced.json & 37 | #_isolate_run $CGROUP_BASE/small /root/schbench/schbench -r 60 -m 16 -t 350 -s 1000 & 38 | RTPID=$! 39 | 40 | _isolate_run $CGROUP_BASE/large stress -c 48 & 41 | wait $RTPID 42 | pkill -9 stress 43 | wait 44 | echo "Small usage" 45 | cat $CGROUP_BASE/small/cpu.stat 46 | echo "Large usage" 47 | cat $CGROUP_BASE/large/cpu.stat 48 | -------------------------------------------------------------------------------- /what-the-fuck-are-we-doing.py: -------------------------------------------------------------------------------- 1 | # The purpose of this is to print out how long all the processes are spending in 2 | # the various scheduler state. 
I used this to figure out exactly how badly 3 | # kswapd was fucking fs_mark during a heavy slab usage run. 4 | from bcc import BPF 5 | from time import sleep 6 | import argparse 7 | import signal 8 | 9 | def signal_ignore(signal, frame): 10 | print() 11 | 12 | class SignalInterrupt(Exception): 13 | def __init__(self, message): 14 | super(SignalInterrupt, self).__init__(message) 15 | 16 | def signal_stop(signal, frame): 17 | raise SignalInterrupt("Interrupted!") 18 | 19 | def pretty_time(value): 20 | if value < 1000000: 21 | return "{} ns".format(value) 22 | value /= 1000000 23 | if value < 1000: 24 | return "{} ms".format(value) 25 | value /= 1000 26 | return "{} secs".format(value) 27 | 28 | bpf_text = """ 29 | #include 30 | #include 31 | 32 | typedef struct sleep_event_s { 33 | u64 pid; 34 | u64 sleep_state; 35 | } sleep_event_t; 36 | 37 | typedef struct sleep_data_s { 38 | u64 time_spent; 39 | u64 num_events; 40 | } sleep_data_t; 41 | 42 | typedef struct comm_name_s { 43 | char name[TASK_COMM_LEN]; 44 | } comm_name_t; 45 | 46 | BPF_HASH(process_names, u64, comm_name_t); 47 | BPF_HASH(start, u64); 48 | BPF_HASH(runtime, u64, sleep_data_t); 49 | BPF_HASH(sleepreason, u64, sleep_event_t); 50 | BPF_HASH(sleeptime, sleep_event_t, sleep_data_t); 51 | 52 | static u64 task_pid_tgid(struct task_struct *task) 53 | { 54 | return (u64)task->tgid << 32 | task->pid; 55 | } 56 | 57 | int oncpu(struct pt_regs *ctx, struct task_struct *prev) 58 | { 59 | u64 pid = task_pid_tgid(prev); 60 | u64 ts = bpf_ktime_get_ns(), *tsp; 61 | u64 delta; 62 | sleep_data_t *d; 63 | sleep_event_t *e; 64 | sleep_data_t zero = {}; 65 | sleep_event_t event = { 66 | .sleep_state = prev->state, 67 | }; 68 | 69 | ADJUST_PID 70 | event.pid = pid; 71 | 72 | tsp = start.lookup(&pid); 73 | if (tsp) { 74 | delta = ts - *tsp; 75 | d = runtime.lookup_or_init(&pid, &zero); 76 | d->time_spent += delta; 77 | d->num_events++; 78 | } 79 | start.update(&pid, &ts); 80 | sleepreason.update(&pid, &event); 81 | 82 | pid = bpf_get_current_pid_tgid(); 83 | 84 | ADJUST_PID 85 | 86 | tsp = start.lookup(&pid); 87 | if (!tsp) { 88 | comm_name_t name; 89 | bpf_get_current_comm(&name.name, sizeof(name.name)); 90 | process_names.update(&pid, &name); 91 | goto out; 92 | } 93 | 94 | ts = bpf_ktime_get_ns(); 95 | delta = ts - *tsp; 96 | 97 | event.pid = pid; 98 | e = sleepreason.lookup(&pid); 99 | if (!e) { 100 | /* this probably shouldn't happen, but if it does put a bogus sleep 101 | state value in there so we know it happened. 
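(For reference, the masked values below line up with the kernel's task states: TASK_RUNNING is 0, TASK_INTERRUPTIBLE is 1 and TASK_UNINTERRUPTIBLE is 2, so the "& 3" buckets every recorded sleep into one of those, and 10 can never collide with a real masked state.)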
*/ 102 | event.sleep_state = 10; 103 | } else { 104 | event.sleep_state = e->sleep_state & 3; 105 | } 106 | d = sleeptime.lookup_or_init(&event, &zero); 107 | d->time_spent += delta; 108 | d->num_events++; 109 | out: 110 | start.update(&pid, &ts); 111 | return 0; 112 | } 113 | """ 114 | 115 | parser = argparse.ArgumentParser() 116 | parser.add_argument("-g", "--group", action='store_true', 117 | help="Group child threads together in the output") 118 | args = parser.parse_args() 119 | 120 | if args.group: 121 | bpf_text = bpf_text.replace('ADJUST_PID', 'pid = (u32)pid;') 122 | else: 123 | bpf_text = bpf_text.replace('ADJUST_PID', '') 124 | 125 | b = BPF(text=bpf_text) 126 | b.attach_kprobe(event="finish_task_switch", fn_name="oncpu") 127 | 128 | print("Tracing, hit Ctrl+C to exit") 129 | signal.signal(signal.SIGINT, signal_stop) 130 | try: 131 | sleep(99999999) 132 | except KeyboardInterrupt: 133 | signal.signal(signal.SIGINT, signal_ignore) 134 | except SignalInterrupt: 135 | signal.signal(signal.SIGINT, signal_ignore) 136 | 137 | sleep_table = b.get_table("sleeptime") 138 | run_table = b.get_table("runtime") 139 | process_names = b.get_table("process_names") 140 | 141 | processes = [] 142 | proc_names = {} 143 | 144 | for k,v in sorted(run_table.items(), key=lambda run: run[1].time_spent, reverse=True): 145 | process = {} 146 | process['pid'] = k.value 147 | process['runtime'] = v.time_spent 148 | process['switches'] = v.num_events 149 | process['sleeptime'] = {} 150 | process['threads'] = 0 151 | 152 | name = "{}".format(process['pid'] >> 32) 153 | for k,v in process_names.items(): 154 | if process['pid'] == k.value: 155 | name = v.name 156 | break 157 | process['name'] = name 158 | for k,v in sleep_table.items(): 159 | if process['pid'] == k.pid: 160 | process['sleeptime'][k.sleep_state] = {} 161 | process['sleeptime'][k.sleep_state]['time'] = v.time_spent 162 | process['sleeptime'][k.sleep_state]['switches'] = v.num_events 163 | if args.group and name in proc_names: 164 | tmp = proc_names[name] 165 | tmp['runtime'] += process['runtime'] 166 | tmp['switches'] += process['switches'] 167 | tmp['threads'] += 1 168 | for k,v in process['sleeptime'].items(): 169 | if k in tmp['sleeptime']: 170 | tmp['sleeptime'][k]['time'] += v['time'] 171 | tmp['sleeptime'][k]['switches'] += v['switches'] 172 | else: 173 | tmp['sleeptime'][k] = {} 174 | tmp['sleeptime'][k]['time'] = v['time'] 175 | tmp['sleeptime'][k]['switches'] = v['switches'] 176 | else: 177 | proc_names[name] = process 178 | processes.append(process) 179 | 180 | if args.group: 181 | processes = [] 182 | for k,v in sorted(proc_names.items(), key=lambda proc: proc[1]['runtime'], reverse=True): 183 | processes.append(v) 184 | 185 | for process in processes: 186 | name = process['name'] 187 | pid = process['pid'] 188 | runtime = process['runtime'] 189 | switches = process['switches'] 190 | output = "Process {} (pid {}) with {} threads ran for {} and was switched {} times".format(name, 191 | pid, process['threads'], pretty_time(runtime), switches) 192 | for k,v in process['sleeptime'].items(): 193 | output += ", slept in state {} {} times for {}".format(k, v['switches'], 194 | pretty_time(v['time'])) 195 | print(output) 196 | -------------------------------------------------------------------------------- /xfs-get-blocks.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from bcc import BPF 3 | import ctypes as ct 4 | 5 | debug = 0 6 | 7 | bpf_text = """ 8 | #include 
9 | #include 10 | #include 11 | 12 | typedef struct rkey_s { 13 | u64 read_size; 14 | u64 bio_size; 15 | u64 num_bios; 16 | u64 add_to_page_cache_failures; 17 | } rkey_t; 18 | 19 | typedef struct bkey_s { 20 | struct buffer_head *map_bh; 21 | u64 b_size; 22 | u64 b_orig_state; 23 | } bkey_t; 24 | 25 | typedef struct data_s { 26 | u64 readpages_size; 27 | u64 bio_size; 28 | u64 num_bios; 29 | u64 add_to_page_cache_failures; 30 | } data_t; 31 | 32 | typedef struct bdata_s { 33 | u64 b_size; 34 | u64 b_found_size; 35 | u64 b_state; 36 | u64 b_orig_state; 37 | } bdata_t; 38 | 39 | typedef struct read_data_s { 40 | u64 bio_ptr; 41 | u64 last_block_in_bio; 42 | u64 first_logical_block; 43 | u64 max_vecs; 44 | u64 page_index; 45 | } read_data_t; 46 | 47 | BPF_HASH(mappings, u64, rkey_t); 48 | BPF_HASH(buffers, u64, bkey_t); 49 | BPF_HASH(readhash, u64, read_data_t); 50 | BPF_PERF_OUTPUT(events); 51 | BPF_PERF_OUTPUT(bevents); 52 | BPF_PERF_OUTPUT(revents); 53 | 54 | int trace_mpage_readpages(struct pt_regs *ctx, struct address_space *mapping, 55 | struct list_head *pages, unsigned nr_pages) 56 | { 57 | u64 magic = mapping->host->i_sb->s_magic; 58 | if (magic != 0x58465342) 59 | return 0; 60 | 61 | rkey_t key = { 62 | .read_size = nr_pages << PAGE_SHIFT, 63 | }; 64 | u64 pid = bpf_get_current_pid_tgid(); 65 | mappings.update(&pid, &key); 66 | return 0; 67 | } 68 | 69 | int trace_mpage_readpages_return(struct pt_regs *ctx) 70 | { 71 | rkey_t *key; 72 | u64 pid = bpf_get_current_pid_tgid(); 73 | 74 | key = mappings.lookup(&pid); 75 | if (!key) 76 | return 0; 77 | 78 | data_t data = { 79 | .readpages_size = key->read_size, 80 | .num_bios = key->num_bios, 81 | .bio_size = key->bio_size, 82 | .add_to_page_cache_failures = key->add_to_page_cache_failures, 83 | }; 84 | events.perf_submit(ctx, &data, sizeof(data)); 85 | mappings.delete(&pid); 86 | return 0; 87 | } 88 | 89 | int trace_exit_add_to_page_cache_lru(struct pt_regs *ctx) 90 | { 91 | u64 pid = bpf_get_current_pid_tgid(); 92 | rkey_t *key; 93 | 94 | key = mappings.lookup(&pid); 95 | if (!key) 96 | return 0; 97 | if (PT_REGS_RC(ctx) != 0) 98 | key->add_to_page_cache_failures++; 99 | return 0; 100 | } 101 | 102 | int trace_submit_bio(struct pt_regs *ctx, int rw, struct bio *bio) 103 | { 104 | if ((rw & 1) == 1) 105 | return 0; 106 | rkey_t *key; 107 | u64 pid = bpf_get_current_pid_tgid(); 108 | 109 | key = mappings.lookup(&pid); 110 | if (!key) 111 | return 0; 112 | key->num_bios++; 113 | key->bio_size += bio->bi_iter.bi_size; 114 | return 0; 115 | } 116 | 117 | int trace_get_blocks(struct pt_regs *ctx, struct inode *inode, 118 | sector_t block, struct buffer_head *map_bh, 119 | int create) 120 | { 121 | if (create) 122 | return 0; 123 | u64 pid = bpf_get_current_pid_tgid(); 124 | rkey_t *rkey; 125 | 126 | rkey = mappings.lookup(&pid); 127 | if (!rkey) 128 | return 0; 129 | 130 | bkey_t key = { 131 | .map_bh = map_bh, 132 | .b_size = map_bh->b_size, 133 | .b_orig_state = map_bh->b_state, 134 | }; 135 | buffers.update(&pid, &key); 136 | return 0; 137 | } 138 | 139 | int trace_exit_get_blocks(struct pt_regs *ctx) 140 | { 141 | u64 pid = bpf_get_current_pid_tgid(); 142 | bkey_t *key; 143 | rkey_t *rkey; 144 | 145 | key = buffers.lookup(&pid); 146 | if (!key) 147 | return 0; 148 | 149 | u64 size, state; 150 | 151 | // the rewriter doesn't recognize this as needing a probe read, so do 152 | // it ourselves 153 | bpf_probe_read(&size, sizeof(u64), &key->map_bh->b_size); 154 | bpf_probe_read(&state, sizeof(u64), &key->map_bh->b_state); 155 | 156 | bdata_t 
data = { 157 | .b_size = key->b_size, 158 | .b_found_size = size, 159 | .b_state = state, 160 | .b_orig_state = key->b_orig_state, 161 | }; 162 | bevents.perf_submit(ctx, &data, sizeof(data)); 163 | buffers.delete(&pid); 164 | return 0; 165 | } 166 | 167 | int trace_do_mpage_readpage(struct pt_regs *ctx, struct bio *bio, struct page *page, 168 | unsigned nr_pages, sector_t *last_block_in_bio, 169 | struct buffer_head *map_bh, unsigned long *first_logical_block) 170 | { 171 | u64 pid = bpf_get_current_pid_tgid(); 172 | rkey_t *rkey; 173 | 174 | rkey = mappings.lookup(&pid); 175 | if (!rkey) 176 | return 0; 177 | 178 | read_data_t data = { 179 | .max_vecs = bio->bi_max_vecs, 180 | .last_block_in_bio = *last_block_in_bio, 181 | .first_logical_block = *first_logical_block, 182 | }; 183 | unsigned long *ptr = (unsigned long *)((void *)page + offsetof(struct page, index)); 184 | bpf_probe_read(&data.page_index, sizeof(u64), ptr); 185 | readhash.update(&pid, &data); 186 | return 0; 187 | } 188 | 189 | int trace_exit_do_mpage_readpage(struct pt_regs *ctx) 190 | { 191 | read_data_t *data; 192 | u64 pid = bpf_get_current_pid_tgid(); 193 | 194 | data = readhash.lookup(&pid); 195 | if (!data) 196 | return 0; 197 | data->bio_ptr = PT_REGS_RC(ctx); 198 | revents.perf_submit(ctx, data, sizeof(*data)); 199 | readhash.delete(&pid); 200 | return 0; 201 | } 202 | """ 203 | 204 | if debug: 205 | print(bpf_text) 206 | 207 | # load BPF program 208 | b = BPF(text=bpf_text) 209 | b.attach_kprobe(event="xfs_get_blocks", fn_name="trace_get_blocks") 210 | b.attach_kretprobe(event="xfs_get_blocks", fn_name="trace_exit_get_blocks") 211 | b.attach_kprobe(event="mpage_readpages", fn_name="trace_mpage_readpages") 212 | b.attach_kretprobe(event="mpage_readpages", fn_name="trace_mpage_readpages_return") 213 | b.attach_kprobe(event="submit_bio", fn_name="trace_submit_bio") 214 | b.attach_kprobe(event="do_mpage_readpage", fn_name="trace_do_mpage_readpage") 215 | b.attach_kretprobe(event="do_mpage_readpage", fn_name="trace_exit_do_mpage_readpage") 216 | b.attach_kretprobe(event="add_to_page_cache_lru", fn_name="trace_exit_add_to_page_cache_lru") 217 | 218 | class Data(ct.Structure): 219 | _fields_ = [ 220 | ("readpages_size", ct.c_ulonglong), 221 | ("bio_size", ct.c_ulonglong), 222 | ("num_bios", ct.c_ulonglong), 223 | ("page_cache_failures", ct.c_ulonglong), 224 | ] 225 | 226 | class BData(ct.Structure): 227 | _fields_ = [ 228 | ("b_size", ct.c_ulonglong), 229 | ("b_found_size", ct.c_ulonglong), 230 | ("b_state", ct.c_ulonglong), 231 | ("b_orig_state", ct.c_ulonglong), 232 | ] 233 | 234 | class RData(ct.Structure): 235 | _fields_ = [ 236 | ("bio_ptr", ct.c_ulonglong), 237 | ("last_block_in_bio", ct.c_ulonglong), 238 | ("first_logical_block", ct.c_ulonglong), 239 | ("max_vecs", ct.c_ulonglong), 240 | ("page_index", ct.c_ulonglong), 241 | ] 242 | 243 | print("%-14s %-14s %-14s %-14s" % ("READPAGES SIZE", "BIO SIZE", "NUM BIOS", "CACHE FAILURES")) 244 | 245 | def print_data(cpu, data, size): 246 | event = ct.cast(data, ct.POINTER(Data)).contents 247 | 248 | print("%-14s %-14s %-14s %-14s" % (event.readpages_size, event.bio_size, 249 | event.num_bios, event.page_cache_failures)) 250 | 251 | def print_rdata(cpu, data, size): 252 | event = ct.cast(data, ct.POINTER(RData)).contents 253 | 254 | print("\treadpage\t%-14s %-14s %-14s %-14s %-14s" % (event.bio_ptr, event.last_block_in_bio, 255 | event.first_logical_block, event.page_index, event.max_vecs)) 256 | 257 | def print_bdata(cpu, data, size): 258 | event = ct.cast(data, ct.POINTER(BData)).contents 259
| 260 | print("\tget_block\t%-14s %-14s %-14s %-14s" % (event.b_size, event.b_found_size, event.b_state, event.b_orig_state)) 261 | 262 | b["events"].open_perf_buffer(print_data) 263 | b["bevents"].open_perf_buffer(print_bdata) 264 | b["revents"].open_perf_buffer(print_rdata) 265 | while 1: 266 | b.kprobe_poll() 267 | -------------------------------------------------------------------------------- /xfs-hang/inject-error.py: -------------------------------------------------------------------------------- 1 | from bcc import BPF 2 | from time import sleep 3 | from subprocess import Popen 4 | import argparse 5 | import sys 6 | 7 | bpf_text = """ 8 | #include 9 | #include 10 | 11 | BPF_HASH(fail_pids, u64); 12 | BPF_HASH(trigger_pid, u64); 13 | BPF_HASH(fail_bufs, u64); 14 | 15 | int trigger_function(struct pt_regs *ctx) 16 | { 17 | u64 pid = bpf_get_current_pid_tgid(); 18 | u64 val = 1; 19 | trigger_pid.update(&pid, &val); 20 | return 0; 21 | } 22 | 23 | int trace_xfs_trans_log_buf(struct pt_regs *ctx, void *ptr, void *buf) 24 | { 25 | u64 pid = bpf_get_current_pid_tgid(); 26 | u64 *val; 27 | u64 ins = (u64)buf; 28 | 29 | val = trigger_pid.lookup(&pid); 30 | if (!val) 31 | return 0; 32 | 33 | fail_bufs.update(&ins, &pid); 34 | return 0; 35 | } 36 | 37 | int trace_xfs_buf_ioapply_map(struct pt_regs *ctx, void *buf, int map, int *offset, int *count, int op) 38 | { 39 | u64 pid = bpf_get_current_pid_tgid(); 40 | u64 search = (u64)buf; 41 | u64 *val; 42 | 43 | if (op != REQ_OP_WRITE) 44 | return 0; 45 | 46 | val = fail_bufs.lookup(&search); 47 | if (!val) 48 | return 0; 49 | bpf_trace_printk("Heeeey this worked\\n"); 50 | fail_pids.update(&pid, &search); 51 | return 0; 52 | } 53 | 54 | int trace_xfs_buf_ioapply_map_ret(struct pt_regs *ctx) 55 | { 56 | u64 pid = bpf_get_current_pid_tgid(); 57 | fail_pids.delete(&pid); 58 | return 0; 59 | } 60 | 61 | int trigger_function_ret(struct pt_regs *ctx) 62 | { 63 | u64 pid = bpf_get_current_pid_tgid(); 64 | trigger_pid.delete(&pid); 65 | return 0; 66 | } 67 | 68 | int override_function(struct pt_regs *ctx) 69 | { 70 | u64 pid = bpf_get_current_pid_tgid(); 71 | u64 *val; 72 | 73 | val = fail_pids.lookup(&pid); 74 | if (!val) 75 | return 0; 76 | 77 | bpf_trace_printk("overriding something\\n"); 78 | unsigned long rc = RCVAL; 79 | bpf_override_return(ctx, rc); 80 | return 0; 81 | } 82 | """ 83 | 84 | parser = argparse.ArgumentParser() 85 | parser.add_argument("-o", "--override", help="The function to override") 86 | parser.add_argument("-r", "--retval", type=int, help="The return value to use") 87 | parser.add_argument("-t", "--trigger", 88 | help="The function that must be called to trigger the error injection") 89 | parser.add_argument("-d", "--delay", type=int, 90 | help="The delay to wait before injecting the error") 91 | parser.add_argument("-T", "--timeout", type=int, 92 | help="Timeout after error injection has been loaded to wait on the task") 93 | parser.add_argument("COMMAND", nargs='+', help="The command to run") 94 | 95 | args = parser.parse_args() 96 | retval = -12  # -ENOMEM by default 97 | 98 | if not args.override: 99 | print("Must specify an override function") 100 | sys.exit(1) 101 | if not args.trigger: 102 | print("Must specify a function as the trigger function") 103 | sys.exit(1) 104 | if args.retval is not None: 105 | retval = args.retval 106 | 107 | bpf_text = bpf_text.replace("RCVAL", str(retval)) 108 | 109 | print("Running command") 110 | p = Popen(args.COMMAND) 111 | if args.delay: 112 | print("Sleeping for {} seconds".format(args.delay)) 113 | sleep(args.delay) 114
| 115 | print("Loading error injection") 116 | b = BPF(text=bpf_text) 117 | 118 | # Load the kretprobe first, because we want the delete guy in place before the 119 | # add guy, otherwise we could inject errors for pids that are no longer in our 120 | # path and cause unfortunate things to happen. 121 | b.attach_kretprobe(event=args.trigger, fn_name="trigger_function_ret") 122 | b.attach_kprobe(event=args.trigger, fn_name="trigger_function") 123 | b.attach_kprobe(event="xfs_trans_log_buf", fn_name="trace_xfs_trans_log_buf") 124 | b.attach_kretprobe(event="xfs_buf_ioapply_map", fn_name="trace_xfs_buf_ioapply_map_ret") 125 | b.attach_kprobe(event="xfs_buf_ioapply_map", fn_name="trace_xfs_buf_ioapply_map") 126 | #b.attach_kprobe(event='submit_bio', fn_name="trace_submit_bio") 127 | #b.attach_kprobe(event='generic_make_request_checks', fn_name="trace_generic_make_request_checks") 128 | b.attach_kprobe(event=args.override, fn_name="override_function") 129 | 130 | print("Dropping caches") 131 | f = open("/proc/sys/vm/drop_caches", "w") 132 | f.write("3") 133 | f.close() 134 | 135 | print("Waiting for the command to exit") 136 | while p.poll() is None: 137 | if args.timeout: 138 | sleep(args.timeout) 139 | if p.poll() is None: 140 | print("Killing the task, it didn't die") 141 | f = open("nofail.txt", "a") 142 | f.write(args.trigger + "\n") 143 | f.close() 144 | p.kill() 145 | p.wait() 146 | break 147 | p.wait() 148 | 149 | # We have to remove in this order, otherwise we could end up with a half 150 | # populated hashmap and end up overriding legitimate things. 151 | b.detach_kprobe(args.override) 152 | b.detach_kprobe(args.trigger) 153 | b.detach_kretprobe(args.trigger) 154 | print("Exiting") 155 | -------------------------------------------------------------------------------- /xfs-hang/reproducer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in $(cat xfs-log-paths.txt) 4 | do 5 | func=$(awk -v f="$i" '$3 == f { print $3 }' /proc/kallsyms) # exact match, a bare grep also hits longer symbols sharing the prefix 6 | [ "$func" == "" ] && continue 7 | echo "testing $func"; 8 | python inject-error.py -t $func -o should_fail_bio -r 1 -d 20 -T 20 ./test.sh 9 | pkill -9 fsstress 10 | while [ $(lsof /mnt/test | wc -l) -gt 0 ] 11 | do 12 | sleep 10 13 | done 14 | umount /mnt/test 15 | done 16 | -------------------------------------------------------------------------------- /xfs-hang/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkfs.xfs -Kf /dev/nvme0n1 4 | mount /dev/nvme0n1 /mnt/test 5 | ~/xfstests/ltp/fsstress -d /mnt/test -n 10000 -p 16 -l 0 -fattr_set=1 -fattr_remove=1 6 | -------------------------------------------------------------------------------- /xfs-hang/xfs-log-paths.txt: -------------------------------------------------------------------------------- 1 | xfs_attr3_leaf_flipflags 2 | xfs_btree_log_recs 3 | xfs_btree_log_ptrs 4 | xfs_da3_node_create 5 | xfs_da3_node_rebalance 6 | xfs_da3_node_add 7 | xfs_da3_fixhashpath 8 | xfs_da3_node_remove 9 | xfs_da3_node_unbalance 10 | xfs_da3_blk_link 11 | xfs_da3_blk_unlink 12 | xfs_da3_swap_lastblock 13 | xfs_dir2_block_log_leaf 14 | xfs_dir2_block_log_tail 15 | xfs_dir3_leaf_log_bests 16 | xfs_dir3_leaf_log_tail 17 | xfs_dir2_free_log_bests 18 | xfs_dir2_free_log_header 19 | xfs_rtmodify_range 20 | xfs_symlink_local_to_remote 21 | xfs_qm_init_dquot_blk 22 | xfs_iunlink_remove 23 | xfs_allocbt_set_root 24 | xfs_allocbt_update_lastrec 25 | xfs_refcountbt_set_root 26 | xfs_rmapbt_set_root 27
| xfs_attr3_leaf_rebalance 28 | xfs_btree_new_root 29 | xfs_btree_block_change_owner 30 | xfs_dir2_block_replace 31 | xfs_dir2_leaf_replace 32 | xfs_dir2_node_replace 33 | xfs_dir3_data_init 34 | xfs_dir2_data_use_free 35 | xfs_dir2_leafn_add 36 | xfs_dir3_leaf_get_buf 37 | xfs_inobt_set_root 38 | xfs_finobt_set_root 39 | xfs_qm_log_quotaoff 40 | xfs_droplink 41 | xfs_free_ag_extent 42 | xfs_allocbt_alloc_block 43 | xfs_rmapbt_alloc_block 44 | xfs_allocbt_free_block 45 | xfs_rmapbt_free_block 46 | xfs_bmap_add_attrfork_btree 47 | xfs_dir2_block_compact 48 | xfs_dir2_leafn_unbalance 49 | xfs_dir2_leafn_rebalance 50 | xfs_rtcopy_summary 51 | xfs_rtany_summary 52 | xfs_attr_shortform_remove 53 | xfs_log_worker 54 | xlog_recover_process_iunlinks 55 | xfs_alloc_ag_vextent_near 56 | xfs_alloc_ag_vextent_size 57 | xfs_btree_rshift 58 | xfs_rtfree_extent 59 | xfs_rtallocate_extent_exact 60 | xfs_attr_shortform_addname 61 | xfs_qm_quotacheck 62 | xfs_quota_disable 63 | xfs_quota_enable 64 | xfs_update_prealloc_flags 65 | xfs_finish_rename 66 | xfs_ioc_setxflags 67 | xfs_vn_update_time 68 | xfs_qm_log_quotaoff_end 69 | xfs_reflink_set_inode_flag 70 | xfs_reflink_update_dest 71 | xfs_bmap_remap_alloc 72 | xfs_bmap_del_extent 73 | xfs_dialloc_ag 74 | xfs_difree_finobt 75 | xfs_rmap_convert 76 | xfs_alloc_update 77 | xfs_bmbt_update 78 | xfs_rmap_update 79 | xfs_btree_delrec 80 | xfs_btree_split_worker 81 | xfs_btree_split 82 | xfs_rtallocate_extent_size 83 | xfs_fs_put_super 84 | xfs_fs_remount 85 | xfs_fs_freeze 86 | xfs_reflink_recover_cow 87 | xfs_setfilesize_ioend 88 | xfs_ioc_swapext 89 | xfs_fssetdm_by_handle 90 | xfs_compat_fssetdm_by_handle 91 | xfs_inode_free_cowblocks 92 | xfs_reflink_unshare 93 | xfs_bmbt_alloc_block 94 | xfs_refcountbt_alloc_block 95 | xfs_alloc_ag_vextent_exact 96 | xfs_refcount_merge_left_extent 97 | xfs_refcount_merge_right_extent 98 | xfs_refcount_adjust_extents 99 | xfs_refcount_adjust_cow_extents 100 | xfs_rmap_convert_shared 101 | xfs_rmap_unmap_shared 102 | xfs_rmap_map_shared 103 | xfs_rmap_free 104 | xfs_rmap_alloc 105 | xfs_inobt_insert_sprec 106 | xfs_dialloc_ag_inobt 107 | xfs_dialloc_ag_update_inobt 108 | xfs_refcount_split_extent 109 | xfs_bmap_add_extent_hole_real 110 | xfs_bmap_alloc 111 | xfs_inobt_alloc_block 112 | xfs_finobt_alloc_block 113 | xfs_inobt_free_block 114 | xfs_refcountbt_free_block 115 | xfs_bmap_shift_extents 116 | xfs_bmap_rtalloc 117 | xfs_file_write_iter 118 | xfs_swap_extent_rmap 119 | xfs_bmapi_convert_unwritten 120 | xfs_bmap_split_extent 121 | xfs_growfs_data 122 | xfs_extent_free_finish_item 123 | xfs_rmap_update_finish_item 124 | xfs_inactive_ifree 125 | xfs_refcount_insert 126 | xfs_rmap_insert 127 | xfs_reflink_clear_inode_flag 128 | xfs_attr3_root_inactive 129 | xfs_reflink_end_cow 130 | xlog_recover_process_efi 131 | xlog_recover_process_rui 132 | __xfs_refcount_cow_alloc 133 | __xfs_refcount_cow_free 134 | xfs_inobt_insert 135 | xfs_initxattrs 136 | xfs_xattr_set 137 | xfs_unmap_extent 138 | xfs_compat_attrmulti_by_handle 139 | xfs_attr_leaf_addname 140 | xfs_attr_node_addname 141 | xfs_end_io 142 | xfs_dio_write_end_io 143 | xfs_fs_commit_blocks 144 | xfs_growfs_rt 145 | xfs_attr_node_removename 146 | xfs_da3_root_join 147 | xfs_dir2_leaf_trim_data 148 | xfs_dir2_node_trim_free 149 | xfs_aops_discard_page 150 | xfs_attr_inactive 151 | xfs_inactive_truncate 152 | xfs_inactive_symlink 153 | xfs_qm_init_quotainos 154 | xfs_refcount_update_finish_item 155 | xfs_bmap_update_finish_item 156 | xfs_attr_shortform_to_leaf 157 | 
xfs_attr3_leaf_to_node 158 | xfs_da3_root_split 159 | xfs_da3_node_split 160 | xfs_dir2_block_to_leaf 161 | xfs_dir2_leafn_split 162 | xfs_dir2_leaf_to_node 163 | xfs_dir2_node_addname_int 164 | xfs_file_fallocate 165 | xfs_ioc_space 166 | xfs_attr_leaf_removename 167 | xfs_dir2_leaf_removename 168 | xfs_dir2_node_to_leaf 169 | xfs_dir2_leafn_remove 170 | xfs_dir2_block_removename 171 | xfs_file_iomap_end 172 | xfs_file_clone_range 173 | xfs_file_dedupe_range 174 | xlog_recover_process_cui 175 | xlog_do_recover 176 | xfs_vn_tmpfile 177 | xfs_file_compat_ioctl 178 | xfs_bmap_add_attrfork_local 179 | xfs_vm_writepages 180 | xfs_dir_removename 181 | xfs_fs_rm_xquota 182 | xfs_vn_create 183 | xfs_vn_mkdir 184 | xfs_cross_rename 185 | xfs_qm_dqusage_adjust 186 | xfs_fs_statfs 187 | xfs_fs_set_info 188 | xfs_fs_set_dqblk 189 | xfs_fs_get_dqblk 190 | xfs_fs_get_nextdqblk 191 | xfs_log_mount 192 | xfs_reflink_reserve_cow 193 | xfs_reflink_allocate_cow 194 | xfs_qm_newmount 195 | xfs_create 196 | xfs_dir2_sf_addname 197 | xfs_file_iomap_begin 198 | xfs_free_file_space 199 | xfs_qm_vop_rename_dqattach 200 | xfs_fs_fill_super 201 | xfs_vn_link 202 | xfs_vn_rename 203 | xfs_vn_symlink 204 | xfs_attrmulti_attr_remove 205 | xfs_inode_free_eofblocks 206 | xfs_cleanup_inode 207 | xfs_vn_unlink 208 | xfs_fs_map_blocks 209 | xfs_map_cow 210 | xfs_rename_alloc_whiteout 211 | xfs_ioc_fssetxattr 212 | xfs_set_mode 213 | xfs_vn_setattr_nonsize 214 | xfs_collapse_file_space 215 | xfs_insert_file_space 216 | xfs_file_release 217 | xfs_fs_destroy_inode 218 | xfs_vn_setattr 219 | xfs_vm_writepage 220 | --------------------------------------------------------------------------------
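
A note on driving the xfs-hang tooling by hand: reproducer.sh walks all of xfs-log-paths.txt, but inject-error.py is also usable against a single trigger function when you want to poke at one path. A minimal sketch, assuming the same /dev/nvme0n1 device and /mnt/test mountpoint that test.sh uses, plus a kernel with bpf_override_return support (CONFIG_BPF_KPROBE_OVERRIDE) and should_fail_bio available as an error injection point:

    # set the filesystem up the same way test.sh does
    mkfs.xfs -Kf /dev/nvme0n1
    mount /dev/nvme0n1 /mnt/test

    # xfs_log_worker is one entry from xfs-log-paths.txt; once it runs, the
    # buffers it logs get tracked, and the next write of one of them has
    # should_fail_bio return 1, which the block layer treats as an IO error
    python inject-error.py -t xfs_log_worker -o should_fail_bio -r 1 -d 20 -T 20 ./test.sh

    umount /mnt/test

-d delays loading the probes until the workload has built up some state, and -T bounds how long we wait on the command after the injection is armed; if it outlives the timeout, the trigger function is appended to nofail.txt and the task is killed.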