├── tests ├── no-trace-bench.txt ├── no-trace.txt ├── do-trace.txt ├── untraced_catcher.cpp ├── asm_filter_2.cpp ├── freq.cpp ├── lib_shared.cpp ├── lib_dyn_shared.cpp ├── sigtrap.cpp ├── tailcall.cpp ├── c.c ├── test.h ├── untraced_funcs.cpp ├── count_shared.cpp ├── count_dyn_shared.cpp ├── shared.cpp ├── killed.cpp ├── buf_size.cpp ├── ftrace.cpp ├── asm_filter.cpp ├── count.cpp ├── ignore_disable.cpp ├── longjmp.cpp ├── exceptions.cpp ├── benchmark.cpp └── orphans.cpp ├── .gitignore ├── images └── krita-trace.png ├── Cargo.toml ├── funcount2sym ├── Cargo.toml └── src │ └── main.rs ├── procaddr2sym ├── Cargo.toml └── src │ └── lib.rs ├── simple-example ├── shared.cpp ├── run.sh ├── test.cpp └── build.sh ├── release.sh ├── funtrace2viz ├── Cargo.toml └── src │ └── main.rs ├── funtrace.dyn ├── compiler-wrappers ├── xray │ └── ld ├── funtrace-finstr-clang++ ├── funtrace-xray-clang++ ├── funtrace-pg-g++ ├── funtrace-finstr-g++ └── funtrace++ ├── LICENSE.txt ├── funtrace_flags.h ├── fun_xray_so.S ├── funcount_pg.S ├── funtrace.h ├── funtrace_gdb.py ├── funtrace_pg.S ├── funcount.cpp ├── tests.py └── README.md /tests/no-trace-bench.txt: -------------------------------------------------------------------------------- 1 | trace_filtered 2 | -------------------------------------------------------------------------------- /tests/no-trace.txt: -------------------------------------------------------------------------------- 1 | _Z20long_but_blacklistedv 2 | -------------------------------------------------------------------------------- /tests/do-trace.txt: -------------------------------------------------------------------------------- 1 | _Z21short_but_whitelistedv 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Cargo.lock 2 | target/ 3 | built-tests/* 4 | out/* 5 | .*.swp 6 | 
-------------------------------------------------------------------------------- /tests/untraced_catcher.cpp: -------------------------------------------------------------------------------- 1 | #define UNTRACED_CATCHER 2 | #include "exceptions.cpp" 3 | -------------------------------------------------------------------------------- /images/krita-trace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yosefk/funtrace/HEAD/images/krita-trace.png -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["procaddr2sym", "funcount2sym", "funtrace2viz"] 3 | resolver = "2" 4 | -------------------------------------------------------------------------------- /tests/asm_filter_2.cpp: -------------------------------------------------------------------------------- 1 | //the same test as asm_filter but compiled with different -funtrace-* flags - we want to see that we get a different trace that way 2 | #include "asm_filter.cpp" 3 | -------------------------------------------------------------------------------- /funcount2sym/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "funcount2sym" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | procaddr2sym = { path = "../procaddr2sym" } 10 | -------------------------------------------------------------------------------- /procaddr2sym/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "procaddr2sym" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | addr2line = "0.20" 8 | chrono = "0.4.39" 9 | cpp_demangle = "0.4.4" 10 | goblin = "0.9.2" 11 | 
memmap2 = "0.9.5" 12 | procfs = "0.17.0" 13 | serde_json = "1.0.134" 14 | -------------------------------------------------------------------------------- /tests/freq.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include 3 | 4 | volatile int n=0; 5 | 6 | void NI usleep_1500() 7 | { 8 | usleep(1500); 9 | n++; 10 | } 11 | 12 | int main() 13 | { 14 | //test that we convert TSC to us correctly 15 | scope_tracer tracer; 16 | usleep_1500(); 17 | } 18 | -------------------------------------------------------------------------------- /simple-example/shared.cpp: -------------------------------------------------------------------------------- 1 | 2 | volatile int glob; 3 | 4 | void __attribute__((noinline)) shared_f(int n) 5 | { 6 | glob = n; 7 | } 8 | 9 | void __attribute__((noinline)) shared_g(int a1, int a2, int a3, int a4, int a5, int a6) 10 | { 11 | shared_f(a1+a2+a3+a4+a5+a6); 12 | shared_f(a1*a2*a3*a4*a5*a6); 13 | } 14 | -------------------------------------------------------------------------------- /tests/lib_shared.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int shared_n; 4 | 5 | void NI f_shared() 6 | { 7 | shared_n++; 8 | } 9 | 10 | void NI g_shared() 11 | { 12 | f_shared(); 13 | f_shared(); 14 | shared_n++; 15 | } 16 | 17 | void NI h_shared() 18 | { 19 | g_shared(); 20 | f_shared(); 21 | shared_n++; 22 | } 23 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | cd .. 
4 | rm -f funtrace/funtrace.zip 5 | zip funtrace/funtrace.zip funtrace/README.md funtrace/funtrace.cpp funtrace/funcount.cpp funtrace/funtrace.h funtrace/funtrace_flags.h funtrace/*.S funtrace/funtrace.dyn \ 6 | funtrace/target/x86_64-unknown-linux-gnu/release/{funcount2sym,funtrace2viz} funtrace/compiler-wrappers/* funtrace/compiler-wrappers/xray/* funtrace/simple-example/* 7 | -------------------------------------------------------------------------------- /funtrace2viz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "funtrace2viz" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | procaddr2sym = { path = "../procaddr2sym" } 10 | bytemuck = { version="1.20.0", features = ["derive"] } 11 | serde_json = "1.0.133" 12 | clap = { version = "3.0", features = ["derive"] } 13 | num = "0.4.3" 14 | -------------------------------------------------------------------------------- /funtrace.dyn: -------------------------------------------------------------------------------- 1 | { 2 | funtrace_pause_and_write_current_snapshot; 3 | funtrace_pause_and_get_snapshot; 4 | funtrace_time; 5 | funtrace_ticks_per_second; 6 | funtrace_pause_and_get_snapshot_starting_at_time; 7 | funtrace_pause_and_get_snapshot_up_to_age; 8 | funtrace_free_snapshot; 9 | funtrace_write_snapshot; 10 | funtrace_ignore_this_thread; 11 | funtrace_set_thread_log_buf_size; 12 | funtrace_disable_tracing; 13 | funtrace_enable_tracing; 14 | }; 15 | -------------------------------------------------------------------------------- /tests/lib_dyn_shared.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int dyn_shared_n; 4 | 5 | void NI f_dyn_shared() 6 | { 7 | dyn_shared_n++; 8 | } 9 | 10 | void NI g_dyn_shared() 11 | { 12 | f_dyn_shared(); 13 | 
f_dyn_shared(); 14 | dyn_shared_n++; 15 | } 16 | 17 | void NI h_dyn_shared() 18 | { 19 | g_dyn_shared(); 20 | f_dyn_shared(); 21 | dyn_shared_n++; 22 | } 23 | 24 | extern "C" void NI h_dyn_shared_c() 25 | { 26 | h_dyn_shared(); 27 | dyn_shared_n++; 28 | } 29 | -------------------------------------------------------------------------------- /tests/sigtrap.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include 3 | #include 4 | #include 5 | 6 | volatile int n; 7 | 8 | void NI traced_func() 9 | { 10 | n++; 11 | } 12 | 13 | void NI traced_thread() 14 | { 15 | while(true) { 16 | traced_func(); 17 | n++; 18 | } 19 | } 20 | 21 | int main() 22 | { 23 | funtrace_ignore_this_thread(); 24 | 25 | std::thread t(traced_thread); 26 | t.detach(); 27 | 28 | while(n < 100); 29 | 30 | kill(getpid(), SIGTRAP); 31 | } 32 | -------------------------------------------------------------------------------- /tests/tailcall.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int n; 4 | 5 | void NI callee() 6 | { 7 | n++; 8 | }; 9 | 10 | void NI tail_caller() 11 | { 12 | n++; 13 | callee(); 14 | } 15 | 16 | void NI NOFUNTRACE callee_untraced() 17 | { 18 | n++; 19 | } 20 | 21 | void NI tail_caller_untraced() 22 | { 23 | n++; 24 | callee_untraced(); 25 | } 26 | 27 | int main() 28 | { 29 | scope_tracer tracer; 30 | for(int i=0; i<3; ++i) { 31 | tail_caller(); 32 | tail_caller_untraced(); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /tests/c.c: -------------------------------------------------------------------------------- 1 | #include "funtrace.h" 2 | 3 | #define NI __attribute__((noinline)) 4 | 5 | volatile int n; 6 | 7 | void NI f() 8 | { 9 | n++; 10 | } 11 | 12 | void NI g() 13 | { 14 | f(); 15 | n++; 16 | f(); 17 | n++; 18 | } 19 | 20 | int main() 21 | { 22 | uint64_t start = funtrace_time(); 23 | 24 
| g(); 25 | 26 | funtrace_snapshot* snapshot = funtrace_pause_and_get_snapshot_starting_at_time(start); 27 | funtrace_write_snapshot("funtrace.raw", snapshot); 28 | funtrace_free_snapshot(snapshot); 29 | } 30 | -------------------------------------------------------------------------------- /tests/test.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "funtrace.h" 4 | 5 | #define NI __attribute__((noinline)) 6 | 7 | struct scope_tracer 8 | { 9 | uint64_t start_time = 0; 10 | const char* fname = nullptr; 11 | 12 | NOFUNTRACE scope_tracer(const char* f="funtrace.raw") 13 | { 14 | fname = f; 15 | start_time = funtrace_time(); 16 | } 17 | 18 | NOFUNTRACE ~scope_tracer() 19 | { 20 | funtrace_snapshot* snapshot = funtrace_pause_and_get_snapshot_starting_at_time(start_time); 21 | funtrace_write_snapshot(fname, snapshot); 22 | funtrace_free_snapshot(snapshot); 23 | }; 24 | }; 25 | -------------------------------------------------------------------------------- /tests/untraced_funcs.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int n; 4 | 5 | inline void NOFUNTRACE nop() {} 6 | 7 | #define UNTRACED(name, callee) void NI NOFUNTRACE name() { n++; callee(); n++; } 8 | #define TRACED(name, callee) void NI name() { n++; callee(); n++; } 9 | 10 | UNTRACED(un1, nop); 11 | TRACED(tr1, un1); 12 | UNTRACED(un2, tr1); 13 | TRACED(tr2, un2); 14 | 15 | UNTRACED(un3, nop); 16 | UNTRACED(un4, un3); 17 | TRACED(tr3, un4); 18 | TRACED(tr4, tr3); 19 | UNTRACED(un5, tr4); 20 | UNTRACED(un6, un5); 21 | 22 | int main() 23 | { 24 | scope_tracer tracer; 25 | 26 | tr2(); 27 | un2(); 28 | 29 | un6(); 30 | tr4(); 31 | } 32 | -------------------------------------------------------------------------------- /tests/count_shared.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int 
shared_n; 4 | 5 | //we want the libraries to be loaded far apart to make sure 6 | //funcount actually finds the newly mapped executable segments 7 | //as opposed to "being lucky" with them mapped where it already 8 | //has pages in its page table 9 | char buf[256*1024]={1}; 10 | 11 | struct glob 12 | { 13 | glob() { shared_n++; } 14 | } gg; 15 | 16 | void NI f_shared() 17 | { 18 | shared_n++; 19 | } 20 | 21 | void NI g_shared() 22 | { 23 | f_shared(); 24 | shared_n++; 25 | f_shared(); 26 | } 27 | 28 | void NI h_shared() 29 | { 30 | g_shared(); 31 | shared_n++; 32 | f_shared(); 33 | } 34 | -------------------------------------------------------------------------------- /simple-example/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -ex 4 | 5 | ./out/test_trace.fi-gcc 6 | ./target/x86_64-unknown-linux-gnu/release/funtrace2viz funtrace.raw out/funtrace-fi-gcc 7 | rm funtrace.raw 8 | 9 | ./out/test_trace.pg 10 | ./target/x86_64-unknown-linux-gnu/release/funtrace2viz funtrace.raw out/funtrace-pg 11 | rm funtrace.raw 12 | 13 | ./out/test_trace.fi-clang 14 | ./target/x86_64-unknown-linux-gnu/release/funtrace2viz funtrace.raw out/funtrace-fi-clang 15 | rm funtrace.raw 16 | 17 | if [ -e ./out/test_trace.xray ]; then 18 | env XRAY_OPTIONS="patch_premain=true" ./out/test_trace.xray 19 | ./target/x86_64-unknown-linux-gnu/release/funtrace2viz funtrace.raw out/funtrace-xray 20 | rm funtrace.raw 21 | fi 22 | -------------------------------------------------------------------------------- /tests/count_dyn_shared.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int dyn_shared_n; 4 | 5 | //we want the libraries to be loaded far apart to make sure 6 | //funcount actually finds the newly mapped executable segments 7 | //as opposed to "being lucky" with them mapped where it already 8 | //has pages in its page table 9 | char 
buf_shared[256*1024]={1}; 10 | 11 | struct glob_dyn 12 | { 13 | glob_dyn() { dyn_shared_n++; } 14 | } gg_dyn; 15 | 16 | void NI f_dyn_shared() 17 | { 18 | dyn_shared_n++; 19 | } 20 | 21 | void NI g_dyn_shared() 22 | { 23 | f_dyn_shared(); 24 | dyn_shared_n++; 25 | f_dyn_shared(); 26 | } 27 | 28 | void NI h_dyn_shared() 29 | { 30 | g_dyn_shared(); 31 | dyn_shared_n++; 32 | f_dyn_shared(); 33 | } 34 | 35 | extern "C" void NI h_dyn_shared_c() 36 | { 37 | h_dyn_shared(); 38 | } 39 | -------------------------------------------------------------------------------- /simple-example/test.cpp: -------------------------------------------------------------------------------- 1 | #include "funtrace.h" 2 | #include 3 | #include 4 | 5 | #define NL __attribute__((noinline)) 6 | 7 | volatile int n; 8 | 9 | NL void f(int i) 10 | { 11 | n = i; 12 | } 13 | 14 | void NL g(int i) 15 | { 16 | f(i); 17 | } 18 | 19 | void NL h(int i) { 20 | g(i); 21 | g(i); 22 | } 23 | 24 | volatile int done = 0; 25 | 26 | void shared_g(int a1, int a2, int a3, int a4, int a5, int a6); 27 | 28 | int main() 29 | { 30 | std::thread t([]{ 31 | pthread_setname_np(pthread_self(), "child"); 32 | for(int i=0; i<100000; ++i) { 33 | h(1); 34 | } 35 | }); 36 | for(int i=0; i<100000; ++i) { 37 | g(2); 38 | shared_g(1,2,3,4,5,6); 39 | } 40 | t.join(); 41 | 42 | funtrace_pause_and_write_current_snapshot(); 43 | } 44 | -------------------------------------------------------------------------------- /compiler-wrappers/xray/ld: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import os 4 | import subprocess 5 | args = sys.argv[1:] 6 | 7 | # this linker wrapper lets us override XRay's functions like __xray_FunctionEntry; 8 | # for this it puts the XRay runtime libraries after our runtime code, and passes 9 | # --allow-multiple-definition 10 | 11 | start = None 12 | for i,arg in enumerate(args): 13 | if arg == '--whole-archive': 14 | start = i 15 | 
elif arg == '--no-whole-archive' and start is not None: 16 | end = i+1 17 | break 18 | elif 'xray' not in arg: 19 | start = None 20 | 21 | if start is not None: 22 | xraylibs = args[start:end] 23 | args = args[:start] + args[end:] 24 | stdlibspos = args.index('-lc') 25 | args = args[:stdlibspos] + xraylibs + args[stdlibspos:] 26 | 27 | args += ['--allow-multiple-definition'] 28 | 29 | ldpath = subprocess.getoutput('which ld') 30 | os.execl(ldpath, *([ldpath] + args)) 31 | -------------------------------------------------------------------------------- /tests/shared.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "test.h" 8 | 9 | volatile int n; 10 | 11 | void NI f() 12 | { 13 | n++; 14 | } 15 | 16 | void NI g() 17 | { 18 | f(); 19 | f(); 20 | n++; 21 | } 22 | 23 | void NI h() 24 | { 25 | g(); 26 | f(); 27 | n++; 28 | } 29 | 30 | void h_shared(); 31 | void (*h_shared_2)(); 32 | 33 | const int64_t iters = 3; 34 | 35 | void NI loop() 36 | { 37 | for(int64_t i=0; i 3 | #include 4 | #include 5 | #include 6 | 7 | volatile int n; 8 | 9 | void NI f() 10 | { 11 | n++; 12 | } 13 | 14 | void NI g() 15 | { 16 | f(); 17 | n++; 18 | } 19 | 20 | void NOFUNTRACE child_inf() 21 | { 22 | g(); 23 | while(1); 24 | } 25 | 26 | void NOFUNTRACE child_fin() 27 | { 28 | pthread_setname_np(pthread_self(), "child"); 29 | g(); 30 | usleep(150*1000); //to get ftrace events 31 | for(volatile int i=0; i<1000000000; ++i); 32 | } 33 | 34 | int main() 35 | { 36 | { 37 | scope_tracer empty; 38 | //just so funtrace.raw is created 39 | } 40 | g(); 41 | 42 | std::thread t1(child_inf); 43 | std::thread t2(child_fin); 44 | 45 | t2.join(); 46 | 47 | //this will leave an ftrace tracer instance that we want some other run 48 | //of a funtrace-instrumented program to collect 49 | abort(); 50 | } 51 | -------------------------------------------------------------------------------- 
/tests/buf_size.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include 3 | #include 4 | 5 | volatile int n = 0; 6 | 7 | void NI f() 8 | { 9 | n++; 10 | } 11 | 12 | int main() 13 | { 14 | scope_tracer tracer; 15 | 16 | //this incidentally tests garbage collection (the thread dies by the time 17 | //the scope tracer is destroyed and we check that we get both threads' traces) 18 | //in addition to checking that we can set per-thread buffer sizes 19 | std::thread t([] { 20 | funtrace_set_thread_log_buf_size(5+4); 21 | pthread_setname_np(pthread_self(), "event_buf_16"); 22 | //check that only 16 function calls out of these 100 are logged into the small buffer 23 | for(int i=0; i<100; ++i) { 24 | f(); 25 | } 26 | }); 27 | 28 | funtrace_set_thread_log_buf_size(5); 29 | pthread_setname_np(pthread_self(), "event_buf_1"); 30 | //check that only one function call out of these 100 is logged into the small buffer 31 | for(int i=0; i<100; ++i) { 32 | f(); 33 | } 34 | t.join(); 35 | } 36 | -------------------------------------------------------------------------------- /tests/ftrace.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include 3 | #include 4 | #include 5 | 6 | void NI spin() 7 | { 8 | volatile int n=0; 9 | for(n=0; n<100000000; ++n); 10 | } 11 | 12 | volatile int n = 0; 13 | 14 | void NI sleep() 15 | { 16 | usleep(150*1000); 17 | n++; 18 | } 19 | 20 | void NI child() 21 | { 22 | pthread_setname_np(pthread_self(), "child"); 23 | spin(); 24 | sleep(); 25 | spin(); 26 | } 27 | 28 | void NI parent() 29 | { 30 | spin(); 31 | sleep(); 32 | spin(); 33 | } 34 | 35 | int main() 36 | { 37 | //the trouble with ftrace is that there's no guarantee on event 38 | //delivery latency from the kernel to the userspace, so when you 39 | //take a snapshot, you might be missing some events; our sleeping 40 | //and busy loops are hopefully long enough for events 
to be consistently 41 | //observed when testing 42 | scope_tracer tracer; 43 | 44 | pthread_setname_np(pthread_self(), "parent"); 45 | 46 | std::thread t(child); 47 | parent(); 48 | 49 | t.join(); 50 | } 51 | -------------------------------------------------------------------------------- /tests/asm_filter.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int n; 4 | 5 | void NI short_function() 6 | { 7 | n++; 8 | } 9 | 10 | void NI short_but_whitelisted() 11 | { 12 | n++; 13 | } 14 | 15 | void NI long_enough_function() 16 | { 17 | short_function(); 18 | n++; 19 | short_function(); 20 | n++; 21 | short_function(); 22 | n++; 23 | short_function(); 24 | n++; 25 | short_function(); 26 | n++; 27 | short_function(); 28 | n++; 29 | short_function(); 30 | n++; 31 | short_function(); 32 | n++; 33 | short_function(); 34 | n++; 35 | short_function(); 36 | n++; 37 | short_function(); 38 | } 39 | 40 | void NI long_but_blacklisted() 41 | { 42 | short_function(); 43 | n++; 44 | short_function(); 45 | n++; 46 | short_function(); 47 | } 48 | 49 | void NI short_with_loop() 50 | { 51 | while(!n); 52 | } 53 | 54 | int main() 55 | { 56 | scope_tracer tracer; 57 | 58 | short_function(); 59 | short_but_whitelisted(); 60 | long_enough_function(); 61 | long_but_blacklisted(); 62 | short_with_loop(); 63 | } 64 | -------------------------------------------------------------------------------- /tests/count.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "test.h" 8 | 9 | volatile int n; 10 | 11 | void NI f() 12 | { 13 | n++; 14 | } 15 | 16 | void NI g() 17 | { 18 | f(); 19 | n++; 20 | f(); 21 | } 22 | 23 | void NI h() 24 | { 25 | g(); 26 | n++; 27 | f(); 28 | } 29 | 30 | void h_shared(); 31 | void (*h_shared_2)(); 32 | 33 | const int64_t iters = 1000; 34 | 35 | void loop() 36 | { 37 | 
for(int64_t i=0; i 3 | 4 | volatile int n = 0; 5 | 6 | void NI should_be_traced() { n++; } 7 | //shouldn't be traced since it's called from an ignored thread 8 | void NI shouldnt_be_traced() { n++; } 9 | 10 | const char* g_child_name = "none"; 11 | 12 | void NI traced_thread() 13 | { 14 | n++; 15 | pthread_setname_np(pthread_self(), g_child_name); 16 | should_be_traced(); 17 | n++; 18 | } 19 | 20 | void NI ignored_thread() 21 | { 22 | n++; 23 | shouldnt_be_traced(); 24 | n++; 25 | funtrace_ignore_this_thread(); 26 | shouldnt_be_traced(); 27 | n++; 28 | } 29 | 30 | void run_threads() 31 | { 32 | std::thread t1(traced_thread); 33 | std::thread t2(ignored_thread); 34 | should_be_traced(); 35 | t1.join(); 36 | t2.join(); 37 | } 38 | 39 | int main() 40 | { 41 | pthread_setname_np(pthread_self(), "main"); 42 | scope_tracer tracer; 43 | 44 | g_child_name = "child1"; 45 | run_threads(); 46 | 47 | funtrace_disable_tracing(); 48 | g_child_name = "child2"; 49 | run_threads(); 50 | 51 | funtrace_enable_tracing(); 52 | g_child_name = "child3"; 53 | run_threads(); 54 | } 55 | -------------------------------------------------------------------------------- /compiler-wrappers/funtrace-finstr-clang++: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import os 4 | import subprocess 5 | linking = '-c' not in sys.argv and '-S' not in sys.argv and '-E' not in sys.argv 6 | args = sys.argv[1:] 7 | 8 | # clang doesn't have -finstrument-functions-exclude-file-list so you can't exclude header files 9 | # easily; it does however have -finstrument-functions-after-inlining which is a good default 10 | # in general and especially in the absence of -finstrument-functions-exclude-file-list. 
11 | # you can use -finstrument-functions instead if needed 12 | args += "-g -pthread -finstrument-functions-after-inlining".split() 13 | 14 | srcdir = os.path.dirname(os.path.dirname(__file__)) 15 | if linking: 16 | is_shared = '-shared' in args 17 | if not is_shared: # don't link the runtime into .so's - only into the executables using them 18 | args += [os.path.join(srcdir, f) for f in ['funtrace.cpp']] + [f'-Wl,--dynamic-list={srcdir}/funtrace.dyn'] 19 | else: 20 | # remove no-undefined for access to the funtrace_* runtime functions 21 | args = [a for a in args if a != '-Wl,--no-undefined'] 22 | args += ['-ldl'] 23 | 24 | funtracexx = os.path.join(srcdir, 'compiler-wrappers/funtrace++') 25 | os.execl(funtracexx, *([funtracexx, 'clang++'] + args)) 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024-2025, Yossi Kreinin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | -------------------------------------------------------------------------------- /tests/longjmp.cpp: -------------------------------------------------------------------------------- 1 | //we use longjmp just as an example of something that breaks the assumption 2 | //that you get a return-from-function event eventually after it was called - 3 | //instead you have here a bunch of functions that are called and never returned 4 | //from. we use this to test the ability of funtrace2viz to (somewhat) recover from such 5 | //scenarios, of which the use of longjmp is one potential cause [which we could 6 | //try to eliminate by interposing longjmp but it doesn't seem popular enough to 7 | //bother and there are probably others] 8 | 9 | #include 10 | #include "test.h" 11 | 12 | volatile int n; 13 | jmp_buf jmpbuf; 14 | 15 | void NI jumper() 16 | { 17 | n++; 18 | longjmp(jmpbuf, 1); 19 | } 20 | 21 | void NI wrapper_call() 22 | { 23 | n++; 24 | jumper(); 25 | n++; 26 | } 27 | 28 | void NI wrapper_call_outer() 29 | { 30 | n++; 31 | wrapper_call(); 32 | n++; 33 | } 34 | 35 | void NI before_setjmp() 36 | { 37 | n++; 38 | } 39 | 40 | void NI after_longjmp() 41 | { 42 | n++; 43 | } 44 | 45 | void NI setter() 46 | { 47 | n++; 48 | before_setjmp(); 49 | if(setjmp(jmpbuf)) { 50 | after_longjmp(); 51 | } 52 | else { 53 | wrapper_call_outer(); 54 | } 55 | } 56 | 57 | int main() 58 | { 59 | scope_tracer tracer; 60 | for(int i=0; i<3; ++i) { 61 | 
setter(); 62 | } 63 | } 64 | 65 | -------------------------------------------------------------------------------- /compiler-wrappers/funtrace-xray-clang++: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import os 4 | import subprocess 5 | linking = '-c' not in sys.argv and '-S' not in sys.argv and '-E' not in sys.argv 6 | args = sys.argv[1:] 7 | 8 | # we use the default instruction threshold, pass -fxray-instruction-threshold=N to override 9 | args += '-g -pthread -fxray-instrument'.split() 10 | if linking: 11 | srcdir = os.path.dirname(os.path.dirname(__file__)) 12 | args += [f'-B{srcdir}/compiler-wrappers/xray'] # use our ld wrapper to override __xray_Function* handlers 13 | 14 | is_shared = '-shared' in args 15 | if not is_shared: # don't link the runtime into .so's - only into the executables using them 16 | args += [os.path.join(srcdir, f) for f in ['funtrace.cpp', 'funtrace_pg.S']] + [f'-Wl,--dynamic-list={srcdir}/funtrace.dyn'] 17 | args += ['-ldl'] 18 | else: 19 | args += ['-fxray-shared','fun_xray_so.S'] # this requires a pretty new version of LLVM; older ones can't instrument inside shared objects 20 | # remove no-undefined for access to the funtrace_* runtime functions 21 | args = [a for a in args if a != '-Wl,--no-undefined'] 22 | 23 | # currently funtrace++ doesn't support filtering of XRay compiler output so we run clang++ directly 24 | clangpath = subprocess.getoutput('which clang++') 25 | os.execl(clangpath, *([clangpath] + [arg for arg in args if not arg.startswith('-funtrace')])) 26 | -------------------------------------------------------------------------------- /compiler-wrappers/funtrace-pg-g++: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import os 4 | import subprocess 5 | linking = '-c' not in sys.argv and '-S' not in sys.argv and '-E' not in sys.argv 6 | args = sys.argv[1:] 7 | 8 | args 
+= '-g -pthread'.split() 9 | srcdir = os.path.dirname(os.path.dirname(__file__)) 10 | if linking: 11 | is_shared = '-shared' in args 12 | if not is_shared: # don't link the runtime into .so's - only into the executables using them 13 | args += [os.path.join(srcdir, f) for f in ['funtrace.cpp', 'funtrace_pg.S']] + [f'-Wl,--dynamic-list={srcdir}/funtrace.dyn'] 14 | else: 15 | # remove no-undefined - __return__ will be undefined in shared objects and so will the funtrace_* runtime functions 16 | args = [a for a in args if a != '-Wl,--no-undefined'] 17 | args += ['-ldl'] 18 | # note that we don't pass -pg when linking (and therefore the -mfentry and -minstrument-return-call 19 | # flags which do nothing without -pg.) this is to avoid the generation of gmon.out. the downside 20 | # is that if .cpp files are passed to the linker (so compiling and linking in a single command), 21 | # we won't instrument those files. proper build system integration is ofc better than this wrapper... 22 | else: 23 | args += '-pg -mfentry -minstrument-return=call'.split() 24 | 25 | funtracexx = os.path.join(srcdir, 'compiler-wrappers/funtrace++') 26 | os.execl(funtracexx, *([funtracexx, 'g++'] + args)) 27 | -------------------------------------------------------------------------------- /compiler-wrappers/funtrace-finstr-g++: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import os 4 | import subprocess 5 | linking = '-c' not in sys.argv and '-S' not in sys.argv and '-E' not in sys.argv 6 | args = sys.argv[1:] 7 | 8 | # change -finstrument-functions-exclude-file-list to suit your needs - this is just a sensible default 9 | # (ofc you don't need to use a compiler wrapper at all; this is just potentially easier than properly 10 | # integrating with the build system on first try. 
more so with -pg than with -finstrument-functions 11 | # because the latter will have problems with an "undefined __return__" function every time gcc is invoked 12 | # with -minstrument-return=call but without funtrace_pg.S...) 13 | args += "-g -pthread -finstrument-functions -finstrument-functions-exclude-file-list=.h,.hpp,/usr/include".split() 14 | 15 | srcdir = os.path.dirname(os.path.dirname(__file__)) 16 | if linking: 17 | is_shared = '-shared' in args 18 | if not is_shared: # don't link the runtime into .so's - only into the executables using them 19 | args += [os.path.join(srcdir, f) for f in ['funtrace.cpp']] + [f'-Wl,--dynamic-list={srcdir}/funtrace.dyn'] 20 | else: 21 | # remove no-undefined for access to the funtrace_* runtime functions 22 | args = [a for a in args if a != '-Wl,--no-undefined'] 23 | args += ['-ldl'] 24 | 25 | funtracexx = os.path.join(srcdir, 'compiler-wrappers/funtrace++') 26 | os.execl(funtracexx, *([funtracexx, 'g++'] + args)) 27 | -------------------------------------------------------------------------------- /funtrace_flags.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | //these definitions must be kept in sync with funtrace2viz's 4 | #define FUNTRACE_RETURN_BIT 63 //normally, a return event logs the address of the returning function... 
5 | #define FUNTRACE_RETURN_WITH_CALLER_ADDRESS_BIT 62 //...except under XRay when it logs the returning function's caller's address 6 | #define FUNTRACE_CATCH_MASK ((1ULL< 3 | 4 | volatile int n; 5 | 6 | extern "C" void NI trace_filtered() 7 | { 8 | n=0; 9 | } 10 | 11 | void NI NOFUNTRACE notrace() 12 | { 13 | n=0; 14 | } 15 | 16 | void NI withtrace() 17 | { 18 | n=0; 19 | } 20 | 21 | const int iter=1000000; 22 | 23 | template 24 | inline uint64_t time(F f, const char* msg, uint64_t base=0) 25 | { 26 | int n=(iter/8)*8; 27 | auto start = funtrace_time(); 28 | for(int i=0; i {{ 8 | println!($($arg)*); 9 | std::process::exit(1); 10 | }}; 11 | } 12 | 13 | fn main() { 14 | let args: Vec = env::args().collect(); 15 | if args.len() != 2 { 16 | fail!("Usage: {} # counts with function names printed to stdout, pipe through c++filt if you want to demangle the symbols", args[0]); 17 | } 18 | 19 | // Open the input file 20 | let file = File::open(args[1].to_string()).expect("failed to open input file"); 21 | let mut reader = BufReader::new(file); 22 | 23 | // Validate and parse the magic strings 24 | let mut line = String::new(); 25 | reader.read_line(&mut line).expect("failed to read FUNCOUNT"); 26 | if line.trim() != "FUNCOUNT" { fail!("missing FUNCOUNT magic string - got `{}'", line); } 27 | line.clear(); 28 | 29 | reader.read_line(&mut line).expect("failed to read PROCMAPS"); 30 | if line.trim() != "PROCMAPS" { fail!("missing PROCMAPS magic string - got `{}'", line); } 31 | line.clear(); 32 | 33 | // Read and parse the memory maps 34 | let mut proc_maps_data = String::new(); 35 | let mut found = false; 36 | while reader.read_line(&mut line).expect("failure reading input file") > 0 { 37 | if line.trim() == "COUNTS" { 38 | found = true; 39 | break; 40 | } 41 | proc_maps_data.push_str(&line); 42 | line.clear(); 43 | } 44 | if !found { fail!("COUNTS magic string not found"); } 45 | line.clear(); 46 | 47 | let input_source = 
Some(procaddr2sym::input_source(args[1].to_string())); 48 | let mut procaddr2sym = ProcAddr2Sym::new(); 49 | procaddr2sym.input_source = input_source; 50 | procaddr2sym.set_proc_maps(proc_maps_data.as_bytes()); 51 | 52 | while reader.read_line(&mut line).expect("failure reading input file") > 0 { 53 | let parts: Vec<&str> = line.trim().split_whitespace().collect(); 54 | if parts.len() != 2 { fail!("Invalid address-count pair {}", line); } 55 | 56 | let address = u64::from_str_radix(parts[0].trim_start_matches("0x"), 16).expect("bad address"); 57 | let count = parts[1].parse::().expect("bad count"); 58 | 59 | let syminfo = procaddr2sym.proc_addr2sym(address); 60 | 61 | println!("{} {:#x} {:#x} {} {} {}:{} {}", count, address, syminfo.static_addr, syminfo.size, syminfo.executable_file, syminfo.file, syminfo.line, syminfo.func); 62 | 63 | line.clear(); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /funtrace.h: -------------------------------------------------------------------------------- 1 | /* 2 | * there are 2 ways to save the trace: 3 | * 4 | * - by getting & saving aside trace data upon interesting events of your choice, 5 | * and eventually writing them out at the time of your choosing. this is good, 6 | * for instance, for keeping a trace corresponding to the slowest observed 7 | * handling of every kind of event (so you throw out this trace and replace it 8 | * with a new one every time you observe an even slower event), and writing 9 | * it all out upon request or when the program terminates. 10 | * 11 | * - by writing to the funtrace.raw file (which is only opened if you call 12 | * funtrace_pause_and_write_current_snapshot() or use `kill -SIGTRAP` on 13 | * the process). 
this is good if you detect moments of peak 14 | * load and want to write the data out immediately, without wasting memory 15 | * for keeping the trace data beyond the cyclic buffers already allocated 16 | * to collect the trace in the first place (the data is written out 17 | * from these buffers while collecting new trace data is paused - same 18 | * as it's paused when data is saved aside for writing out later, but 19 | * for a longer period of time.) the downside is that you can't "unwrite" 20 | * the trace data, and you don't choose when to handle the writing but 21 | * rather have it occur immediately after deciding to save the trace. 22 | */ 23 | #pragma once 24 | 25 | #include 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | /* to "just append the current trace snapshot to funtrace.raw", all you need 32 | is this function (this is also what SIGTRAP does unless you compile with 33 | -DFUNTRACE_NO_SIGTRAP) 34 | 35 | threads cannot be created, and their termination is delayed until the data 36 | is fully written out 37 | 38 | note that if a shared object was unloaded during the time range in the snapshot 39 | (thankfully not a very common scenario), function calls traced from this shared 40 | object will not be possible to decode to symbolic function names (this is true 41 | for all the functions taking snapshots below) 42 | */ 43 | void funtrace_pause_and_write_current_snapshot(); 44 | 45 | /* these methods are for saving trace data snapshots, and then 46 | writing them out at the time of your choosing. */ 47 | 48 | struct funtrace_snapshot; 49 | 50 | /* a snapshot has the size FUNTRACE_BUF_SIZE times the number of threads alive 51 | at the time when it's taken. 
threads can't be created and can't terminate 52 | until the trace data is copied into the snapshot */ 53 | struct funtrace_snapshot* funtrace_pause_and_get_snapshot(); 54 | 55 | /* you might also want to only get the data up to a certain age, 56 | both to save time & space and to get "the part you want" (like from the 57 | start of handling some event till the end) */ 58 | uint64_t funtrace_time(); /* timestamp from the same source used for tracing */ 59 | uint64_t funtrace_ticks_per_second(); /* funtrace_time()/funtrace_ticks_per_second() converts time to seconds */ 60 | 61 | struct funtrace_snapshot* funtrace_pause_and_get_snapshot_starting_at_time(uint64_t time); 62 | struct funtrace_snapshot* funtrace_pause_and_get_snapshot_up_to_age(uint64_t max_event_age); 63 | void funtrace_free_snapshot(struct funtrace_snapshot* snapshot); /* nop if NULL is passed */ 64 | 65 | /* writing out a sample into its own file after it was obtained with funtrace_pause_and_get_snapshot() 66 | does not interfere with threads starting and terminating. TODO: we could add a version with 67 | a "write_data" callback instead of a filename given demand */ 68 | void funtrace_write_snapshot(const char* filename, struct funtrace_snapshot* snapshot); 69 | 70 | /* this is useful to save memory for the event buffer in threads you don't want to trace, 71 | and also to save some but not all of the function call overhead due to being compiled 72 | with tracing enabled */ 73 | void funtrace_ignore_this_thread(); 74 | 75 | /* set this thread's buffer size (must be a power of 2, so defined by a log value, 76 | which must be larger the log of the size of 2 events; 77 | using a smaller value is equivalent to callung funtrace_ignore_this_thread()). */ 78 | void funtrace_set_thread_log_buf_size(int log_buf_size); 79 | 80 | /* disabling tracing will speed things up slightly. note that we don't 81 | free the buffers when disabling tracing and don't reallocate them 82 | when enabling tracing. 
funtrace_ignore_this_thread() is how you free 83 | the buffer of a thread. */ 84 | void funtrace_disable_tracing(); 85 | void funtrace_enable_tracing(); 86 | 87 | #ifdef __clang__ 88 | #define NOFUNTRACE __attribute__((xray_never_instrument)) __attribute__((no_instrument_function)) 89 | #define DOFUNTRACE __attribute__((xray_always_instrument)) 90 | #else 91 | #define NOFUNTRACE __attribute__((no_instrument_function)) 92 | #define DOFUNTRACE /* gcc doesn't have an attribute to force instrumentation */ 93 | #endif 94 | 95 | #ifdef __cplusplus 96 | } 97 | #endif 98 | -------------------------------------------------------------------------------- /funtrace_gdb.py: -------------------------------------------------------------------------------- 1 | # `info proc mappings` format: 2 | # Start Addr End Addr Size Offset objfile 3 | # 0x555555554000 0x555555556000 0x2000 0x0 /path/to/file 4 | # 5 | # /proc/self/maps format - start-end permissions offset device inode /path/to/file 6 | # 7f74a4ae6000-7f74a4b08000 r--p 00000000 103:07 109578392 /usr/lib/x86_64-linux-gnu/libc-2.31.so 7 | 8 | import gdb, struct, traceback 9 | 10 | def write_chunk(f, magic, content): 11 | assert len(magic)==8 12 | f.write(magic) 13 | f.write(struct.pack('Q', len(content))) 14 | f.write(content) 15 | 16 | def write_proc_maps(f): 17 | mappings = gdb.execute('info proc mappings', from_tty=False, to_string=True) 18 | 19 | proc_maps = b'' 20 | for line in mappings.strip().split('\n'): 21 | line = line.strip() 22 | if line.startswith('0x'): 23 | t = line.split() 24 | if len(t) == 5: # we don't care about unnamed segments 25 | start, end, size, offset, path = line.split() 26 | # we don't care about permissions, device and inode 27 | proc_maps += b'%10x-%10x r-xp %08x 0:0 0 %s\n'%(int(start,16), int(end,16), int(offset,16), bytes(path,encoding='utf-8')) 28 | 29 | print('funtrace: saving proc mappings') 30 | write_chunk(f, b'PROCMAPS', proc_maps) 31 | 32 | def get_vector_elements(v): 33 | vis = 
gdb.default_visualizer(v) 34 | if vis: 35 | return [elem for _,elem in list(vis.children())] 36 | else: # no pretty printers - assume we know the representation 37 | start = v['_M_impl']['_M_start'] 38 | finish = v['_M_impl']['_M_finish'] 39 | 40 | return [start[i] for i in range(finish-start)] 41 | 42 | def get_string(s): 43 | vis = gdb.default_visualizer(s) 44 | if False and vis: # the False comments it out since we get garbage like "" with this at times, 45 | # and I haven't found a way to avoid this 46 | return str(vis.to_string().value())[1:-1] 47 | else: # rely on the representation 48 | length = int(s['_M_string_length']) 49 | data_ptr = s['_M_dataplus']['_M_p'] 50 | 51 | return str(gdb.selected_inferior().read_memory(data_ptr, length), encoding='utf-8') 52 | return str(gdb.Value(data_ptr.cast(gdb.lookup_type('char').pointer()).string(length=length)))[1:-1] 53 | 54 | def write_ftrace(f): 55 | try: 56 | handler = gdb.parse_and_eval('g_ftrace_handler') 57 | except: 58 | print("funtrace: compiled without ftrace support - can't fetch last ftrace events") 59 | return 60 | 61 | events = get_vector_elements(handler['events']) 62 | pos = int(handler['pos']) 63 | lines = [] 64 | def collect_lines(start, end): 65 | lines.extend([get_string(events[i]['line']) for i in range(start, end) if int(events[i]['timestamp'])]) 66 | collect_lines(pos, len(events)) # these are the older ones 67 | collect_lines(0, pos) 68 | 69 | print(f'funtrace: saving {len(lines)} ftrace events') 70 | write_chunk(f, b'FTRACETX', ('\n'.join(lines) + '\n').encode('utf-8')) 71 | 72 | def write_funtrace(f): 73 | p_trace_state = gdb.parse_and_eval('g_p_trace_state') 74 | if p_trace_state == gdb.Value(0).cast(p_trace_state.type): 75 | print('funtrace not initialized yet - no trace data') 76 | return 77 | trace_state = p_trace_state.dereference() 78 | 79 | write_chunk(f, b'FUNTRACE', struct.pack('Q', int(trace_state['cpu_freq']))) 80 | 81 | cmdline = get_string(trace_state['cmdline']) 82 | 
print(f'funtrace: core dump generated by `{cmdline}`') 83 | write_chunk(f, b'CMD LINE', cmdline.encode('utf-8')) 84 | 85 | thread_traces = get_vector_elements(trace_state['thread_traces']) 86 | for i, trace in enumerate(thread_traces): 87 | # separately add ftrace 88 | trace = trace.dereference() 89 | thread_id = trace['id'] 90 | buf_size = trace['buf_size'] 91 | write_chunk(f, b'THREADID', bytes(gdb.selected_inferior().read_memory(thread_id.address, thread_id.type.sizeof))) 92 | 93 | buf = trace['buf'] 94 | data = bytes(gdb.selected_inferior().read_memory(buf, buf_size)) 95 | name = thread_id["name"].string() 96 | print(f'funtrace: thread {thread_id["tid"]} {name} - saving {buf_size} bytes of data read from {buf}') 97 | write_chunk(f, b'TRACEBUF', data) 98 | 99 | write_ftrace(f) 100 | 101 | write_chunk(f, b'ENDTRACE', b'') 102 | print('funtrace: done - decode with `funtrace2viz funtrace.raw out` and then view in viztracer (pip install viztracer) with `vizviewer out.json`') 103 | 104 | class FuntraceCmd(gdb.Command): 105 | '''prints the content of the funtrace event buffers into ./funtrace.raw 106 | 107 | you can then decode that file using funtrace2viz and open the output JSON files with vizviewer 108 | (installed by `pip install viztracer`) or Perfetto (https://ui.perfetto.dev, click 109 | "Open with legacy UI" - no source access unlike in vizviewer but otherwise should work)''' 110 | 111 | def __init__(self): 112 | super(FuntraceCmd, self).__init__("funtrace", gdb.COMMAND_DATA) 113 | 114 | def invoke(self, arg, from_tty): 115 | try: 116 | with open('funtrace.raw', 'wb') as f: 117 | write_proc_maps(f) 118 | write_funtrace(f) 119 | except: 120 | traceback.print_exc() 121 | raise 122 | 123 | FuntraceCmd() 124 | -------------------------------------------------------------------------------- /funtrace_pg.S: -------------------------------------------------------------------------------- 1 | #ifdef FUNTRACE_FUNCOUNT 2 | #include "funcount_pg.S" 3 | #else 4 | 5 | 
#include "funtrace_flags.h" 6 | 7 | .p2align 4 8 | .globl __fentry__ 9 | .type __fentry__, @function 10 | .globl __xray_FunctionEntry 11 | .type __xray_FunctionEntry, @function 12 | .globl exe_xray_FunctionEntry 13 | .type exe_xray_FunctionEntry, @function 14 | __fentry__: 15 | __xray_FunctionEntry: 16 | exe_xray_FunctionEntry: 17 | .cfi_startproc 18 | // r11 = g_thread_trace.pos 19 | movq %fs:g_thread_trace@tpoff, %r11 20 | // cyclic buffer wraparound - clear the FUNTRACE_LOG_BUF_SIZE bit in pos 21 | andq %fs:8+g_thread_trace@tpoff, %r11 22 | // if(!g_thread_trace.wraparound_mask) return 23 | je .early_exit_from_fentry 24 | 25 | // r10 = __builtin_return_address(0) 26 | movq (%rsp), %r10 27 | // rdtsc clobbers rdx which might have been used for a caller's parameter - save 28 | pushq %rdx 29 | 30 | // rax = __rdtsc() 31 | rdtsc 32 | salq $32, %rdx 33 | orq %rdx, %rax 34 | // pos->func = return_address 35 | movq %r10, (%r11) 36 | // pos++ 37 | addq $16, %r11 38 | // pos->cycle = rdtsc (the pos _before_ the increment; gcc generated this code...) 39 | movq %rax, -8(%r11) 40 | // save pos back to g_thread_trace.pos 41 | movq %r11, %fs:g_thread_trace@tpoff 42 | 43 | popq %rdx 44 | .early_exit_from_fentry: 45 | ret 46 | 47 | // XRay instrumentation (unlike __fentry__/__return__ -pg instrumentation) calls separate 48 | // functions upon returning from a function and upon tail-calling a function instead of 49 | // returning from its caller. you would think this lets us present the tail call correctly (instead 50 | // of, given f which calls g which tail-calls h, misprepresent h as being called from f, 51 | // which we end up doing under -pg instrumentation because __return__ is called by g 52 | // before jumping to h) 53 | // 54 | // however, in practice, it's not clear how to make use of the tail-call vs return distinction. 
55 | // for example, you push the tail-caller to the stack and pop it when its callee returns; this 56 | // works well if this tail-callee is instrumented or calls at least one instrumented function 57 | // itself, but what if it doesn't - when is the tail-caller going to be "diagnosed" as having 58 | // returned in that case? 59 | // 60 | // XRay itself records distinct event types, EXIT and TAIL_EXIT, and then xray-converter.cpp 61 | // treats them exactly the same in exportAsChromeTraceEventFormat(). we simply record a single 62 | // event type for both events 63 | .cfi_endproc 64 | .size __fentry__, .-__fentry__ 65 | .size __xray_FunctionEntry, .-__xray_FunctionEntry 66 | .size exe_xray_FunctionEntry, .-exe_xray_FunctionEntry 67 | 68 | 69 | .p2align 4 70 | .globl __return__ 71 | .type __return__, @function 72 | .globl __xray_FunctionExit 73 | .type __xray_FunctionExit, @function 74 | .globl exe_xray_FunctionExit 75 | .type exe_xray_FunctionExit, @function 76 | .globl __xray_FunctionTailExit 77 | .type __xray_FunctionTailExit, @function 78 | .globl exe_xray_FunctionTailExit 79 | .type exe_xray_FunctionTailExit, @function 80 | __return__: 81 | __xray_FunctionExit: 82 | exe_xray_FunctionExit: 83 | __xray_FunctionTailExit: 84 | exe_xray_FunctionTailExit: 85 | .cfi_startproc 86 | 87 | movq %fs:g_thread_trace@tpoff, %r11 88 | andq %fs:8+g_thread_trace@tpoff, %r11 89 | je .early_exit_from_return 90 | 91 | movq (%rsp), %r10 92 | 93 | //rdtsc clobbers both of these; __return__ can't clobber rax 94 | //(unlike __fentry__ which can.) 
note that the opposite isn't true - 95 | //__return__ can't clobber rdx "symmetrically" to __fentry__'s clobbering 96 | //of rax, because a tail call can happen after the call to __return__ 97 | //(not sure why gcc does it this way but it does) and this tail call 98 | //might get an argument in rdx 99 | pushq %rdx 100 | pushq %rax 101 | 102 | rdtsc 103 | salq $32, %rdx 104 | orq %rdx, %rax 105 | //this is the main addition in __return__ to the code of __fentry__ 106 | #ifndef __clang__ 107 | btsq $FUNTRACE_RETURN_BIT, %r10 108 | #else 109 | btsq $FUNTRACE_RETURN_WITH_CALLER_ADDRESS_BIT, %r10 //XRay jumps to the exit handler 110 | //rather than calling it; you get an integer ID of the returning function in a register 111 | //but we don't use it (it's not that trivial to decode, and XRay itself still doesn't 112 | //do it for functions in shared objects as of early 2025). our return address thus 113 | //points into the caller of the returning function 114 | #endif 115 | movq %rax, 8(%r11) 116 | addq $16, %r11 117 | movq %r10, -16(%r11) 118 | movq %r11, %fs:g_thread_trace@tpoff 119 | 120 | popq %rax 121 | popq %rdx 122 | .early_exit_from_return: 123 | ret 124 | 125 | .cfi_endproc 126 | .size __return__, .-__return__ 127 | .size __xray_FunctionExit, .-__xray_FunctionExit 128 | .size exe_xray_FunctionExit, .-exe_xray_FunctionExit 129 | .size __xray_FunctionTailExit, .-__xray_FunctionTailExit 130 | .size exe_xray_FunctionTailExit, .-exe_xray_FunctionTailExit 131 | #endif 132 | -------------------------------------------------------------------------------- /compiler-wrappers/funtrace++: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | ''' 3 | usage: funtrace++ [-funtrace-instr-thresh=N] [-funtrace-no-trace=] [-funtrace-do-trace=] [-funtrace-ignore-loops] 4 | ''' 5 | import os, sys, subprocess 6 | 7 | class TraceSupressor: 8 | def __init__(self, args): 9 | self.instr_thresh = 0 10 | self.ignore_loops = False 11 | 
self.no_trace = [] 12 | self.do_trace = [] 13 | self.verbose = False 14 | def val(arg): return arg.split('=')[-1] 15 | def lines(file): return file, open(file).read().strip().split() 16 | for arg in args: 17 | if arg.startswith('-funtrace-instr-thresh='): 18 | self.instr_thresh = int(val(arg)) 19 | elif arg.startswith('-funtrace-no-trace='): 20 | self.no_file, self.no_trace = lines(val(arg)) 21 | elif arg.startswith('-funtrace-do-trace='): 22 | self.do_file, self.do_trace = lines(val(arg)) 23 | elif arg == '-funtrace-ignore-loops': 24 | self.ignore_loops = True 25 | elif arg == '-funtrace-verbose': 26 | self.verbose = True 27 | 28 | def suppress(self, funcname, num_instr, loops): 29 | if funcname in self.do_trace: 30 | if self.verbose: 31 | print(f'{funcname} listed in the do-trace file {self.do_file}') 32 | return 33 | reason = None 34 | if funcname in self.no_trace: 35 | reason = f'{funcname} listed in the trace suppression file {self.no_file}' 36 | elif num_instr < self.instr_thresh: 37 | if not loops: 38 | reason = f'{funcname} has {num_instr} instructions, less than -funtrace-instr-thresh={self.instr_thresh}' 39 | elif self.ignore_loops: 40 | reason = f'{funcname} has {num_instr} instructions, less than -funtrace-instr-thresh={self.instr_thresh}; it has {loops} loops but -funtrace-ignore-loops was passed' 41 | if self.verbose and reason: 42 | print(reason) 43 | return reason 44 | 45 | # note that we don't support filtering XRay's output, in part on the theory that it already has 46 | # -fxray-instruction-threshold=N; to support it, we'd need to look for the NOPs it inserts - it doesn't 47 | # put in call instructions, that's done by runtime code patching 48 | hooks = [ 49 | '__cyg_profile_func_enter', 50 | '__cyg_profile_func_exit', 51 | '__fentry__', 52 | '__return__', 53 | ] 54 | 55 | def filter_asm(asm_file, suppressor): 56 | with open(asm_file) as f: 57 | lines = f.read().split('\n') 58 | 59 | funcname = None 60 | infunc = False 61 | instrs = 0 62 | loops 
= 0 63 | labels = [] 64 | funcstart = None 65 | 66 | changed = False 67 | 68 | for i,line in enumerate(lines): 69 | l = line.strip() 70 | if l.startswith('.type') and l.endswith('@function'): 71 | funcname = l.split(',')[0].split()[-1] 72 | elif l == '.cfi_startproc': 73 | #print('in func',funcname) 74 | infunc = True 75 | funcstart = i+1 76 | instrs = 0 77 | loops = 0 78 | labels = [] 79 | elif l == '.cfi_endproc': 80 | #print('end func', funcname, instrs, loops) 81 | infunc = False 82 | suppression_reason = suppressor.suppress(funcname, instrs, loops) 83 | if not suppression_reason: 84 | continue 85 | 86 | for j in range(funcstart, i): 87 | l = lines[j].strip() 88 | for hook in hooks: 89 | if 'call' in l or 'jmp' in l: 90 | if hook in l: 91 | lines[j] = '# ' + lines[j] + ' # ' + suppression_reason 92 | if 'jmp' in l: # tail call 93 | lines[j] = ' ret ' + lines[j] 94 | changed = True 95 | break 96 | elif infunc: 97 | if not l: 98 | continue 99 | t = l.split()[0] 100 | isinstr = line[0].isspace() and not t.startswith('.') and not t.endswith(':') 101 | if isinstr: 102 | instrs += 1 103 | for label in labels: 104 | if label in l: 105 | loops += 1 106 | break 107 | elif t.startswith('.') and t.endswith(':'): 108 | labels.append(t[:-1]) 109 | 110 | if changed: 111 | with open(asm_file, 'w') as f: 112 | f.write('\n'.join(lines)) 113 | 114 | def exec_compiler(cmd, execl=True): 115 | compiler = cmd[0] 116 | if not os.path.exists(compiler): 117 | compiler = subprocess.getoutput(f'which {compiler}') 118 | if execl: 119 | os.execl(compiler, *cmd) 120 | else: 121 | subprocess.run([compiler]+cmd[1:]) 122 | 123 | def compile_filter_and_assemble(cmd, funtrace_args): 124 | suppressor = TraceSupressor(funtrace_args) 125 | compile_to_asm_cmd, assemble_cmd, asm_file = compile_and_assemble_commands(cmd) 126 | 127 | #print(' '.join(compile_to_asm_cmd)) 128 | exec_compiler(compile_to_asm_cmd, execl=False) 129 | 130 | filter_asm(asm_file, suppressor) 131 | 132 | #print(' 
'.join(assemble_cmd)) 133 | exec_compiler(assemble_cmd, execl=False) 134 | 135 | def compile_and_assemble_commands(cmd): 136 | ofile = None 137 | cfile = None 138 | sfile = None 139 | 140 | extensions = 'c cpp cc cxx cp CPP c++ C'.split() 141 | def is_src_arg(arg): 142 | if arg.startswith('-'): 143 | return False 144 | for ext in extensions: 145 | if arg.endswith(ext): 146 | return True 147 | 148 | for i,arg in enumerate(cmd): 149 | if arg == '-o' and i+1 < len(cmd): 150 | ofile = cmd[i+1] 151 | sfile = ofile+'.s' 152 | elif is_src_arg(arg): 153 | cfile = arg 154 | 155 | if cfile: 156 | if not ofile: 157 | ofile = cfile[:cfile.rfind('.')] + '.o' 158 | sfile = cfile[:cfile.rfind('.')] + '.s' 159 | compile_to_asm_cmd = [('-S' if arg == '-c' else arg) for arg in cmd] + ['-o',sfile] 160 | else: 161 | compile_to_asm_cmd = [('-S' if arg == '-c' else (sfile if arg == ofile else arg)) for arg in cmd] 162 | else: 163 | print(f'funtrace++ - WARNING: -c passed but could not determine the input source file in `{cmd}`') 164 | exec_compiler(cmd) 165 | 166 | assemble_cmd = [cmd[0], '-c', sfile, '-o', ofile] 167 | if 'clang' in cmd[0]: 168 | assemble_cmd += ['-Wa,-W'] # clang produces assembly using MD5 sums for some source files but not others and then the assembler warns of 169 | # "inconsistent use of md5 sums", not sure how to suppress this better... 
170 | 171 | return compile_to_asm_cmd, assemble_cmd, sfile 172 | 173 | def main(): 174 | cmd = sys.argv[1:] 175 | funtrace_args = [arg for arg in cmd if arg.startswith('-funtrace-')] 176 | cmd = [arg for arg in cmd if not arg.startswith('-funtrace-')] 177 | if '-c' in cmd and '-E' not in cmd and '-dM' not in cmd and funtrace_args: 178 | compile_filter_and_assemble(cmd, funtrace_args) 179 | else: 180 | exec_compiler(cmd) 181 | 182 | if __name__ == '__main__': 183 | main() 184 | -------------------------------------------------------------------------------- /funcount.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #ifndef FUNCOUNT_PAGE_TABLES 11 | #define FUNCOUNT_PAGE_TABLES 1 12 | #endif 13 | 14 | #ifdef __clang__ 15 | #define NOINSTR __attribute__((xray_never_instrument)) __attribute__((no_instrument_function)) 16 | #else 17 | #define NOINSTR __attribute__((no_instrument_function)) 18 | #endif 19 | #define INLINE __attribute__((always_inline)) 20 | 21 | const int PAGE_BITS = 16; //works for a 2-level page table with 48b virtual addresses 22 | //which is OK for most userspace address spaces 23 | const int PAGE_SIZE = 1<> PAGE_BITS*2; 29 | //make sure bits higher than PAGE_BITS*3 are not set 30 | assert((bits & PAGE_BITS_MASK) == bits && "pointer has more than 48 bits set - try recompiling funcount.cpp with a larger PAGE_BITS constant"); 31 | return bits; 32 | } 33 | 34 | inline uint64_t NOINSTR mid_bits(uint64_t address) { return (address >> PAGE_BITS) & PAGE_BITS_MASK; } 35 | inline uint64_t NOINSTR low_bits(uint64_t address) { return address & PAGE_BITS_MASK; } 36 | 37 | //8-byte counts have the downside where very short functions are counted together; 38 | //4-byte counts would have been better for this but would be more likely to overflow 39 | typedef uint64_t count_t; 40 | 41 | struct CountsPage 42 | { 43 | 
std::atomic counts[PAGE_SIZE/sizeof(count_t)]; 44 | NOINSTR CountsPage() { memset(counts, 0, sizeof(counts)); } 45 | }; 46 | 47 | struct CountsPagesL1 48 | { 49 | CountsPage* pages[PAGE_SIZE]; 50 | NOINSTR CountsPagesL1() { memset(pages, 0, sizeof(pages)); } 51 | }; 52 | 53 | struct CountsPagesL2 54 | { 55 | CountsPagesL1* pagesL1[PAGE_SIZE]; 56 | //this counts function calls in executable segments not mapped at the time 57 | //when the code was running (allocate_range() wasn't called); AFAIK this 58 | //should be limited to constructors in shared objects (which get called 59 | //before we get a chance to call dl_iterate_phdr() to update our view 60 | //of the address space) 61 | // 62 | //note that these misses could be avoided by allocating the pages on demand 63 | //when a function is first called; however this slows things down even 64 | //if it's done in a non-thread-safe manner (potentially leaking pages and 65 | //losing call counts) and more so if it's done with in a thread-safe way 66 | //(we have a commit in the history doing this with 2 compare_exchange_strong() 67 | //calls.) for the purpose of finding the most commonly called functions 68 | //in order to exclude them from funtrace instrumentation, it's probably better 69 | //to limit the slowdown (s.t. 
interactive / real-time flows have some chance 70 | //of being usable for collecting statistics) at the expense of missing 71 | //constructor calls in dynamic libraries (which are very unlikely to be 72 | //where you badly need to suppress funtrace instrumentation because of 73 | //its overhead) 74 | std::atomic unknown; 75 | 76 | void NOINSTR init() 77 | { 78 | memset(pagesL1, 0, sizeof(pagesL1)); 79 | unknown = 0; 80 | } 81 | 82 | void NOINSTR allocate_range(uint64_t base, uint64_t size) 83 | { 84 | uint64_t start = base & ~PAGE_BITS_MASK; 85 | uint64_t end = (base + size + PAGE_SIZE - 1) & ~PAGE_BITS_MASK; 86 | for(uint64_t address=start; address<=end; address+=PAGE_SIZE) { 87 | auto high = high_bits(address); 88 | auto& pages = pagesL1[high]; 89 | if(!pages) { 90 | pages = new CountsPagesL1; 91 | } 92 | 93 | auto mid = mid_bits(address); 94 | auto& page = pages->pages[mid]; 95 | if(!page) { 96 | page = new CountsPage; 97 | } 98 | } 99 | } 100 | 101 | std::atomic& INLINE NOINSTR get_count(uint64_t address) 102 | { 103 | auto high = high_bits(address); 104 | auto pages = pagesL1[high]; 105 | if(!pages) { 106 | return unknown; 107 | } 108 | 109 | auto mid = mid_bits(address); 110 | auto page = pages->pages[mid]; 111 | if(!page) { 112 | return unknown; 113 | } 114 | 115 | auto low = low_bits(address); 116 | return page->counts[low / sizeof(count_t)]; 117 | } 118 | 119 | NOINSTR ~CountsPagesL2(); 120 | }; 121 | 122 | static CountsPagesL2 g_page_tab[FUNCOUNT_PAGE_TABLES]; 123 | 124 | static inline unsigned int INLINE NOINSTR core_num() 125 | { 126 | unsigned int aux; 127 | __rdtscp(&aux); 128 | return aux & 0xfff; 129 | } 130 | 131 | extern "C" void NOINSTR __cyg_profile_func_enter(void* func, void* caller) 132 | { 133 | static_assert(sizeof(count_t) == sizeof(std::atomic), "wrong size of atomic"); 134 | uint64_t addr = (uint64_t)func; 135 | int tab_ind = FUNCOUNT_PAGE_TABLES == 1 ? 
0 : core_num() % FUNCOUNT_PAGE_TABLES; 136 | std::atomic& count = g_page_tab[tab_ind].get_count(addr); 137 | count += 1; 138 | } 139 | 140 | extern "C" void NOINSTR __cyg_profile_func_exit(void* func, void* caller) {} 141 | 142 | #include 143 | #include 144 | #include 145 | 146 | NOINSTR CountsPagesL2::~CountsPagesL2() 147 | { 148 | //the first object in the array is constructed first and destroyed last - 149 | auto last_page_tab = &g_page_tab[0]; 150 | 151 | std::ofstream out; 152 | if(this == last_page_tab) { 153 | out.open("funcount.txt"); 154 | out << "FUNCOUNT\nPROCMAPS\n"; 155 | std::ifstream maps_file("/proc/self/maps", std::ios::binary); 156 | if (!maps_file.is_open()) { 157 | std::cerr << "funtrace - failed to open /proc/self/maps, traces will be impossible to decode" << std::endl; 158 | return; 159 | } 160 | 161 | std::vector maps_data( 162 | (std::istreambuf_iterator(maps_file)), 163 | std::istreambuf_iterator()); 164 | 165 | maps_file.close(); 166 | out.write(&maps_data[0], maps_data.size()); 167 | out << "COUNTS\n"; 168 | } 169 | 170 | for(uint64_t hi=0; hipages[mid]; 175 | if(page) { 176 | for(uint64_t lo=0; locounts[lo]; 178 | if(count) { 179 | uint64_t address = (hi << PAGE_BITS*2) | (mid << PAGE_BITS) | (lo * sizeof(count_t)); 180 | if(this == last_page_tab) { 181 | //print the final counts 182 | out << std::hex << "0x" << address << ' ' << std::dec << count << '\n'; 183 | } 184 | else { 185 | //accumulate the results into the first page table 186 | last_page_tab->get_count(address) += count; 187 | } 188 | } 189 | } 190 | } 191 | pages->pages[mid] = nullptr; 192 | delete page; 193 | } 194 | pagesL1[hi] = nullptr; 195 | delete pages; 196 | } 197 | } 198 | if(unknown) { 199 | if(this == last_page_tab) { 200 | std::cout << "WARNING: " << unknown << " function calls were to functions in parts of the address space unknown at the time they were made (likely constructors in shared objects)" << std::endl; 201 | } 202 | else { 203 | last_page_tab->unknown 
+= unknown; 204 | } 205 | } 206 | if(this == last_page_tab) { 207 | std::cout << "function call count report saved to funcount.txt - decode with funcount2sym to get: call_count, dyn_addr, static_addr, num_bytes, bin_file, src_file:src_line, mangled_func_name" << std::endl; 208 | } 209 | } 210 | 211 | static int NOINSTR phdr_callback (struct dl_phdr_info *info, size_t size, void *data) 212 | { 213 | for(int i=0; idlpi_phnum; ++i ) { 214 | const auto& phdr = info->dlpi_phdr[i]; 215 | if(phdr.p_type == PT_LOAD && (phdr.p_flags & PF_X)) { 216 | uint64_t start_addr = info->dlpi_addr + phdr.p_vaddr; 217 | for(int t=0; t) -> Option<&MemoryMap> { 22 | maps.binary_search_by(|map| { 23 | if address < map.address.0 { 24 | std::cmp::Ordering::Greater // Address is before this map 25 | } else if address >= map.address.1 { 26 | std::cmp::Ordering::Less // Address is after this map 27 | } else { 28 | std::cmp::Ordering::Equal // Address is within this map 29 | } 30 | }) 31 | .ok() 32 | .map(|index| &maps[index]) 33 | } 34 | 35 | struct Symbol { 36 | base_address: u64, 37 | size: u64, 38 | name: String, 39 | } 40 | 41 | fn read_elf_symbols(elf: &Elf) -> Vec { 42 | // Create a vector to store our symbols 43 | let mut symbols = Vec::new(); 44 | 45 | // Process dynamic symbols if they exist 46 | for sym in elf.dynsyms.iter() { 47 | // Get the symbol name from the dynamic string table 48 | if let Some(name) = elf.dynstrtab.get_at(sym.st_name) { 49 | symbols.push(Symbol { 50 | base_address: sym.st_value, 51 | size: sym.st_size, 52 | name: name.to_string(), 53 | }); 54 | } 55 | } 56 | 57 | // Process regular symbols if they exist 58 | for sym in elf.syms.iter() { 59 | // Get the symbol name from the string table 60 | if let Some(name) = elf.strtab.get_at(sym.st_name) { 61 | symbols.push(Symbol { 62 | base_address: sym.st_value, 63 | size: sym.st_size, 64 | name: name.to_string(), 65 | }); 66 | } 67 | } 68 | 69 | // Sort symbols by base address 70 | symbols.sort_by_key(|sym| 
sym.base_address); 71 | 72 | symbols 73 | } 74 | 75 | fn find_symbol(symbols: &Vec, address: u64) -> Option<&Symbol> { 76 | // Binary search for the largest base address that's <= our target address 77 | let idx = match symbols.binary_search_by_key(&address, |sym| sym.base_address) { 78 | Ok(exact) => exact, 79 | Err(insert_pos) => { 80 | if insert_pos == 0 { 81 | return None; 82 | } 83 | insert_pos - 1 84 | } 85 | }; 86 | 87 | // Get candidate symbol and check if address falls within its range 88 | let candidate = &symbols[idx]; 89 | if address >= candidate.base_address && address < candidate.base_address + candidate.size { 90 | Some(candidate) 91 | } else { 92 | None 93 | } 94 | } 95 | 96 | #[derive(Debug)] 97 | struct SubsPath { 98 | src: String, 99 | dst: String, 100 | } 101 | 102 | fn parse_substitute_path_json(file_name: &str) -> Vec { 103 | let mut file = match File::open(file_name) { 104 | Ok(file) => file, 105 | Err(_) => { 106 | return Vec::new(); 107 | } 108 | }; 109 | 110 | let mut json_str = String::new(); 111 | if let Err(e) = file.read_to_string(&mut json_str) { 112 | eprintln!("Warning: Failed to read from file '{}': {}", file_name, e); 113 | return Vec::new(); 114 | } 115 | 116 | let json_value: Value = match serde_json::from_str(&json_str) { 117 | Ok(value) => value, 118 | Err(e) => { 119 | eprintln!("Warning: Failed to parse JSON in file '{}': {}", file_name, e); 120 | return Vec::new(); 121 | } 122 | }; 123 | 124 | let mut subs_paths = Vec::new(); 125 | 126 | if let Some(array) = json_value.as_array() { 127 | for item in array { 128 | if let Some(inner_array) = item.as_array() { 129 | if inner_array.len() == 2 { 130 | if let (Some(src), Some(dst)) = (inner_array[0].as_str(), inner_array[1].as_str()) { 131 | subs_paths.push(SubsPath { 132 | src: src.to_string(), 133 | dst: dst.to_string(), 134 | }); 135 | } else { 136 | eprintln!("Warning: Invalid string pair in file '{}'", file_name); 137 | } 138 | } else { 139 | eprintln!("Warning: Array does 
not contain exactly 2 elements in file '{}'", file_name); 140 | } 141 | } else { 142 | eprintln!("Warning: Expected array in file '{}'", file_name); 143 | } 144 | } 145 | } else { 146 | eprintln!("Warning: Top level object is not an array in file '{}'", file_name); 147 | } 148 | 149 | subs_paths 150 | } 151 | 152 | struct ExecutableFileMetadata 153 | { 154 | program_headers: Vec, 155 | addr2line: Context>>, 156 | symbols: Vec, 157 | } 158 | 159 | pub struct InputSource { 160 | path: String, 161 | modified: SystemTime, 162 | } 163 | 164 | pub fn input_source(path: String) -> InputSource { 165 | InputSource { path: path.clone(), modified: fs::metadata(path).unwrap().modified().unwrap() } 166 | } 167 | 168 | pub struct ProcAddr2Sym { 169 | maps: Vec, 170 | sym_cache: HashMap, 171 | sym_missing: HashSet, 172 | offset_cache: HashMap, 173 | source_files: HashSet, //kept just to print "modified after the input source" warnings once per file 174 | subs_path: Vec, 175 | pub input_source: Option, 176 | } 177 | 178 | #[derive(Debug, Clone, Hash, PartialEq, std::cmp::Eq)] 179 | pub struct SymInfo { 180 | pub func: String, //before c++filt 181 | pub demangled_func: String, //after c++filt 182 | //note that these are, whenever possible, the file:line of the FIRST function 183 | //address, NOT the address passed to proc_addr2sym! 184 | //TODO: given demand we can provide a way to pass the file:line of the actual 185 | //address passed to proc_addr2sym 186 | pub file: String, //source file 187 | pub line: u32, //line number in the file 188 | pub executable_file: String, //executable or shared object 189 | pub static_addr: u64, //the address in the executable's symbol table 190 | //(without the dynamic offset to which it's loaded - this offset is subtracted 191 | //from the input address passed to proc_addr2sym()). 
like file:line, whenever 192 | //possible, this is the base address of the function, not the address 193 | //directly corresponding to the input dynamic address 194 | pub size: u64, //0 if no symbol found 195 | } 196 | 197 | fn time2str(time: &SystemTime) -> String { 198 | let datetime: DateTime = (*time).into(); 199 | datetime.format("%Y-%m-%d %H:%M:%S").to_string() 200 | } 201 | 202 | //sometimes you will see function names like "f(int) [clone .constprop.1]" 203 | //or "f(int) [clone .cold]", due to the compiler generating multiple copies of the code for various reasons. 204 | //we strip this "[clone .whatever]" stuff, not only because it's not too helpful for human users, 205 | //but because it actively interferes with eg exception handling (when throw/catch return us to "f() [clone .cold]" 206 | //we need to know that it's the same as the "f()" we have on our stack to be able to pop f's callees from the stack; 207 | //we don't want "[clone .cold]" to throw us off) 208 | fn strip_clone(input: String) -> String { 209 | if let Some(index) = input.find(" [clone ") { 210 | input[..index].to_string() 211 | } else { 212 | input 213 | } 214 | } 215 | 216 | impl ProcAddr2Sym { 217 | pub fn new() -> Self { 218 | ProcAddr2Sym { maps: Vec::new(), sym_cache: HashMap::new(), sym_missing: HashSet::new(), offset_cache: HashMap::new(), source_files: HashSet::new(), 219 | subs_path: parse_substitute_path_json("substitute-path.json"), input_source: None } 220 | } 221 | 222 | fn substitute_path(&self, path: String) -> String { 223 | let mut s = path; 224 | for subs in &self.subs_path { 225 | s = s.replace(&subs.src, &subs.dst); 226 | } 227 | s 228 | } 229 | 230 | // note that updating the maps doesn't invalidate sym_cache - we don't need to parse 231 | // the DWARF of the executables / shared objects again; but it does invalidate offset_cache 232 | // since the same shared object might have been loaded to a different offset 233 | pub fn set_proc_maps(&mut self, proc_maps_data: 
&[u8]) { 234 | let memory_maps = MemoryMaps::from_buf_read(proc_maps_data).expect("failed to parse /proc/self/maps data"); 235 | self.maps = memory_maps.into_iter().collect(); 236 | // not sure we need to sort them - /proc/self/maps appears already sorted - but can't hurt 237 | self.maps.sort_by_key(|map| map.address.0); 238 | self.offset_cache = HashMap::new(); 239 | } 240 | 241 | pub fn unknown_symbol(&self) -> SymInfo { 242 | return SymInfo { func: "??".to_string(), demangled_func: "??".to_string(), file: "??".to_string(), line: 0, executable_file: "??".to_string(), static_addr: 0, size: 0 }; 243 | } 244 | 245 | pub fn proc_addr2sym(&mut self, proc_address: u64) -> SymInfo { 246 | let unknown = self.unknown_symbol(); 247 | let map_opt = find_address_in_maps(proc_address, &self.maps); 248 | if map_opt == None { return unknown; } 249 | let map = map_opt.unwrap(); 250 | 251 | let path_opt = match &map.pathname { 252 | MMapPath::Path(p) => Some(p), 253 | _ => None, 254 | }; 255 | if path_opt == None { return unknown; } 256 | let path = path_opt.unwrap(); 257 | 258 | let pathstr = self.substitute_path(path.to_string_lossy().to_string()); 259 | if self.sym_missing.contains(&pathstr) { 260 | return unknown; 261 | } 262 | if !self.sym_cache.contains_key(&pathstr) { 263 | let fileopt = File::open(pathstr.clone()); 264 | if fileopt.is_err() { 265 | println!("WARNING: couldn't open executable file {} - you can remap paths using a substitute-path.json file in your working directory", pathstr); 266 | self.sym_missing.insert(pathstr); 267 | return unknown; 268 | } 269 | let file = fileopt.unwrap(); 270 | if let Some(ref input_source) = self.input_source { 271 | let modified = fs::metadata(pathstr.clone()).expect("failed to stat file").modified().expect("failed to get last modification timestamp"); 272 | if modified > input_source.modified { 273 | println!("WARNING: executable file {} last modified at {} - later than {} ({})", pathstr, time2str(&modified), input_source.path, 
time2str(&input_source.modified)); 274 | } 275 | } 276 | let buffer = unsafe { Mmap::map(&file).expect("failed to mmap executable file") }; 277 | let elf = Elf::parse(&buffer).expect("Failed to parse ELF"); 278 | let symbols = read_elf_symbols(&elf); 279 | let program_headers = elf.program_headers.clone(); 280 | let object = object::File::parse(&*buffer).expect("Failed to parse ELF"); 281 | let ctx = addr2line::Context::new(&object).expect("Failed to create addr2line context"); 282 | self.sym_cache.insert(pathstr.clone(), ExecutableFileMetadata { program_headers, addr2line: ctx, symbols }); 283 | } 284 | let meta = self.sym_cache.get(&pathstr).unwrap(); 285 | 286 | if !self.offset_cache.contains_key(&map.address.0) { 287 | //find the program header containing the file offset of this mapping 288 | let mut found = false; 289 | for phdr in meta.program_headers.iter() { 290 | if map.offset >= phdr.p_offset && map.offset < (phdr.p_offset + phdr.p_filesz) { 291 | let vaddr_offset = (map.offset - phdr.p_offset) + phdr.p_vaddr; 292 | self.offset_cache.insert(map.address.0, vaddr_offset); 293 | found = true; 294 | break; 295 | } 296 | } 297 | if !found { return unknown; } 298 | } 299 | let vaddr_offset = self.offset_cache.get(&map.address.0).unwrap(); 300 | let mut static_addr = proc_address - map.address.0 + vaddr_offset; 301 | let mut size = 0; 302 | 303 | let mut name = "??".to_string(); 304 | let mut demangled_func = "??".to_string(); 305 | let mut name_found = false; 306 | 307 | if let Some(sym) = find_symbol(&meta.symbols, static_addr) { 308 | name_found = true; 309 | name = sym.name.clone(); 310 | static_addr = sym.base_address; 311 | size = sym.size; 312 | if let Ok(demsym) = cpp_demangle::Symbol::new(name.clone()) { 313 | demangled_func = demsym.to_string(); 314 | } 315 | else { 316 | demangled_func = name.clone(); 317 | } 318 | } 319 | 320 | let (file, linenum) = match meta.addr2line.find_location(static_addr) { 321 | Ok(Some(location)) => 
(location.file.unwrap_or("??"), location.line.unwrap_or(0)), 322 | _ => ("??",0), 323 | }; 324 | let file = self.substitute_path(file.to_string()); 325 | if let Some(ref input_source) = self.input_source { 326 | let file = file.clone(); 327 | if !self.source_files.contains(&file) { 328 | //don't warn if we can't access the file (maybe the source code isn't supposed to be 329 | //on this machine or it's a relative path or whatever); do warn if we can access it and it's newer than the data 330 | //source - very likely a mistake the user should be aware of 331 | if let Ok(meta) = fs::metadata(file.clone()) { 332 | if let Ok(modified) = meta.modified() { 333 | if modified > input_source.modified { 334 | println!("WARNING: source file {} last modified at {} - later than {} ({})", file, time2str(&modified), input_source.path, time2str(&input_source.modified)); 335 | } 336 | } 337 | } 338 | self.source_files.insert(file); 339 | } 340 | } 341 | 342 | if !name_found { 343 | //not sure if we are ever going to meet a case where there's no ELF symbol name 344 | //but we do have DWARF debug info but can't hurt to try. 
345 | // 346 | //there are at least 3 reasons not to use this code by itself, without bothering 347 | //with ELF symbol tables at all: 348 | // 349 | //* sometimes you have ELF symbols but no DWARF debug info 350 | //* some functions (such as "virtual" and "non-virtual" "thunks" auto-generated by gcc 351 | // have an ELF symbol but no debug info in DWARF (at least not function name info; 352 | // and incidentally we very much _need_ this info because such thunks have __return__ 353 | // without __fentry__ and we need to keep this from mauling the decoded trace) 354 | //* we want, at least in funtrace's context, to find file:line of the first function 355 | // address, which the ELF symbol readily makes available 356 | // 357 | //but it seems harmless to keep this code as fallback just in case 358 | //(in any case we use addr2line for the file:line info so "the object is already there".) 359 | if let Ok(frames) = meta.addr2line.find_frames(static_addr).skip_all_loads() { 360 | if let Ok(Some(frame)) = frames.last() { 361 | if let Some(funref) = frame.function.as_ref() { 362 | if let Ok(fname) = funref.raw_name() { 363 | name = fname.to_string(); 364 | demangled_func = name.clone(); 365 | } 366 | if let Ok(dname) = funref.demangle() { 367 | demangled_func = dname.to_string(); 368 | } 369 | } 370 | } 371 | } 372 | } 373 | SymInfo{func:strip_clone(name), demangled_func:strip_clone(demangled_func), file, line:linenum, executable_file:pathstr, static_addr, size} 374 | } 375 | } 376 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import json 3 | import os 4 | import glob 5 | from multiprocessing import Pool 6 | 7 | call='+' 8 | ret='-' 9 | 10 | def parse_perfetto_json(fname): 11 | with open(fname) as f: 12 | data = json.load(f) 13 | events = data['traceEvents'] 14 | threads = {} 15 | thread_names = {} 16 | 
thread2timestamps = {} 17 | for event in events: 18 | phase = event['ph'] 19 | tid = event['tid'] 20 | name = event['name'] 21 | if 'std::thread::_Invoker' in name or 'std::thread::thread' in name: # we use std::thread in tests - ignore the noise it adds to traces 22 | continue # 23 | if phase == 'M': # metadata 24 | if name == 'thread_name': 25 | thread_names[tid] = event['args']['name'] 26 | if thread_names[tid] in threads: # not a unique name 27 | thread_names[tid] += '.%d'%tid # mangle by tid 28 | continue 29 | assert phase == 'X' # complete event 30 | timepoints = threads.setdefault(thread_names[tid], list()) 31 | timestamp = event['ts'] 32 | duration = event['dur'] 33 | 34 | timestamps = thread2timestamps.setdefault(tid, dict()) 35 | 36 | assert timestamp not in timestamps, f'expecting unique timestamps in every thread! 2 events with the same timestamp: call of {event}; {timestamps[timestamp]}' 37 | assert timestamp+duration not in timestamps, f'expecting unique timestamps in every thread! 
2 events with the same timestamp: return of {event}; {timestamps[timestamp+duration]}' 38 | timestamps[timestamp] = ('call',event) 39 | timestamps[timestamp+duration] = ('ret',event) 40 | 41 | timepoints.append((call, name, timestamp)) 42 | timepoints.append((ret, name, timestamp+duration)) 43 | 44 | # sort by the timestamp 45 | for timepoints in threads.values(): 46 | timepoints.sort(key=lambda t: (t[2])) 47 | 48 | data['threads'] = threads 49 | 50 | return data 51 | 52 | def print_thread(flow,line=-1): 53 | level = 0 54 | for i,point in enumerate(flow): 55 | what = point[0] 56 | name = point[1] 57 | if what == ret: 58 | level -= 1 59 | start = ' '*level 60 | if line>=0: 61 | if i= 150000 352 | 353 | func_start = None 354 | func_finish = None 355 | for what, func, when in threads[thread]: 356 | if func.startswith('sleep'): 357 | if what == call: 358 | func_start = when 359 | else: 360 | func_finish = when 361 | break 362 | assert func_start is not None and func_finish is not None 363 | print(' in sleep() for', func_finish - func_start) 364 | assert start > func_start and finish < func_finish 365 | 366 | def system(cmd): 367 | print('running',cmd) 368 | status = os.system(cmd) 369 | if 'killed' not in cmd: # we have a test that kills itself with SIGKILL - other than that commands shouldn't fail 370 | assert status==0, f'`{cmd}` failed with status {status}' 371 | 372 | BUILDDIR = './built-tests' 373 | OUTDIR = './out' 374 | TARGET = 'x86_64-unknown-linux-gnu' 375 | 376 | def build_trace_analysis_tools(): 377 | system(f'RUSTFLAGS="-C target-feature=+crt-static" cargo build -r --target {TARGET}') 378 | 379 | def run_cmds(cmds): 380 | for cmd in cmds: 381 | system(cmd) 382 | 383 | def build_cxx_test(main, shared=[], dyn_shared=[], flags=''): 384 | cmdlists = [] 385 | binaries = {} 386 | for mode in ['fi-gcc','fi-clang','pg','xray']: 387 | CXXFLAGS=f"-O3 -std=c++11 -Wall {flags}" 388 | if mode == 'xray': 389 | CXXFLAGS += " -fxray-instruction-threshold=1" 390 | compiler 
= { 391 | 'fi-gcc':'finstr-g++', 392 | 'fi-clang':'finstr-clang++', 393 | 'pg':'pg-g++', 394 | 'xray':'xray-clang++', 395 | } 396 | CXX = f'./compiler-wrappers/funtrace-{compiler[mode]}' 397 | test = main.split('.')[0] 398 | binary = f'{BUILDDIR}/{test}.{mode}' 399 | cmds = [] 400 | LIBS = '' 401 | DYNLIBS = '' 402 | if shared or dyn_shared: 403 | for cpp in shared+dyn_shared: 404 | module = cpp.split('.')[0] 405 | lib = f'{os.path.realpath(BUILDDIR)}/{module}.{mode}.so' 406 | cmds += [ 407 | f'{CXX} -c tests/{cpp} -o {BUILDDIR}/{module}.{mode}.o {CXXFLAGS} -I. -fPIC', 408 | f'{CXX} -o {lib} {BUILDDIR}/{module}.{mode}.o {CXXFLAGS} -fPIC -shared', 409 | ] 410 | if cpp in dyn_shared: 411 | DYNLIBS += ' '+lib 412 | else: 413 | LIBS += ' '+lib 414 | dlibs = '' 415 | if LIBS: 416 | dlibs = f'-DLIBS=\\"{DYNLIBS.strip()}\\"' 417 | cmds += [ 418 | f'{CXX} -c tests/{main} -o {BUILDDIR}/{test}.{mode}.o {CXXFLAGS} -I. {dlibs}', 419 | f'{CXX} -o {binary} {BUILDDIR}/{test}.{mode}.o {CXXFLAGS}{LIBS}', 420 | ] 421 | cmdlists.append(cmds) 422 | binaries.setdefault(test,list()).append(binary) 423 | return cmdlists, binaries 424 | 425 | def run_cxx_test(test, binaries): 426 | cmdlists = [] 427 | for binary in binaries: 428 | name = os.path.basename(binary) 429 | env = '' 430 | if 'xray' in binary: 431 | env = 'env XRAY_OPTIONS="patch_premain=true"' 432 | cmds = [ 433 | f'mkdir -p {OUTDIR}/{name}', 434 | f'cd {OUTDIR}/{name}; {env} ../../{binary}', 435 | ] 436 | if 'count' in test: 437 | cmds += [ 438 | f'./target/{TARGET}/release/funcount2sym {OUTDIR}/{name}/funcount.txt | c++filt > {OUTDIR}/{name}/symcount.txt' 439 | ] 440 | else: 441 | cmds += [ 442 | f'./target/{TARGET}/release/funtrace2viz {OUTDIR}/{name}/funtrace.raw {OUTDIR}/{name}/funtrace > {OUTDIR}/{name}/f2v.out' 443 | ] 444 | cmdlists.append(cmds) 445 | return cmdlists 446 | 447 | 448 | def main(): 449 | global pool 450 | pool = Pool() 451 | build_trace_analysis_tools() 452 | system(f'rm -rf {BUILDDIR}') 453 | system(f'rm 
-rf {OUTDIR}') 454 | system(f'mkdir -p {BUILDDIR}') 455 | 456 | cmdlists = [] 457 | test2bins = {} 458 | def buildcmds(*args,**kw): 459 | c,b = build_cxx_test(*args,**kw) 460 | cmdlists.extend(c) 461 | test2bins.update(b) 462 | 463 | buildcmds('ignore_disable.cpp') 464 | buildcmds('exceptions.cpp') 465 | buildcmds('untraced_catcher.cpp') 466 | buildcmds('untraced_funcs.cpp') 467 | buildcmds('longjmp.cpp') 468 | buildcmds('tailcall.cpp') 469 | buildcmds('orphans.cpp') 470 | buildcmds('buf_size.cpp') 471 | buildcmds('benchmark.cpp',flags=f'-funtrace-no-trace={os.path.realpath("tests/no-trace-bench.txt")}') 472 | buildcmds('freq.cpp') 473 | buildcmds('killed.cpp') 474 | buildcmds('sigtrap.cpp') 475 | buildcmds('ftrace.cpp') 476 | buildcmds('asm_filter.cpp',flags=f'-funtrace-instr-thresh=20 -funtrace-no-trace={os.path.realpath("tests/no-trace.txt")} -funtrace-do-trace={os.path.realpath("tests/do-trace.txt")}') 477 | buildcmds('asm_filter_2.cpp',flags=f'-funtrace-instr-thresh=20 -funtrace-ignore-loops') 478 | buildcmds('shared.cpp',shared=['lib_shared.cpp'],dyn_shared=['lib_dyn_shared.cpp']) 479 | buildcmds('count.cpp',shared=['count_shared.cpp'],dyn_shared=['count_dyn_shared.cpp'],flags='-DFUNTRACE_FUNCOUNT -DFUNCOUNT_PAGE_TABLES=2') 480 | buildcmds('c.c') 481 | pool.map(run_cmds, cmdlists) 482 | 483 | cmdlists = [] 484 | killedcmds = [] 485 | for test,binaries in test2bins.items(): 486 | cmds = killedcmds if 'killed' in test else cmdlists # we run killed later 487 | cmds.extend(run_cxx_test(test,binaries)) 488 | 489 | pool.map(run_cmds, cmdlists) 490 | check() 491 | 492 | pool.map(run_cmds, killedcmds) 493 | for binary in test2bins['killed']: 494 | if 'xray' in binary or 'clang' in binary: 495 | continue # my gdb is too old to parse the latest LLVM's DWARF; there's no better reason for this condition... 
496 | check_funtrace_from_core_dump(binary) 497 | check_orphan_tracer_removal() 498 | 499 | jsonmod = json 500 | def check(): 501 | print('checking results...') 502 | 503 | def load_threads(json): 504 | return parse_perfetto_json(json)['threads'] 505 | def load_thread(json): 506 | return list(load_threads(json).values())[0] 507 | def load_ftrace(json): 508 | return jsonmod.load(open(json))['systemTraceEvents'] 509 | 510 | def jsons(test): return sorted(glob.glob(f'{OUTDIR}/{test}.*/funtrace.json')) 511 | 512 | # funtrace tests [except freq] 513 | for json in jsons('ignore_disable'): 514 | print('checking',json) 515 | threads = load_threads(json) 516 | assert len(threads) == 3 517 | for name,thread in threads.items(): 518 | if name in ['child1','child3']: 519 | assert verify_thread(thread, ignore_disable_child_ref) 520 | elif name == 'main': 521 | assert verify_thread(thread, ignore_disable_main_ref) 522 | else: 523 | assert False, f'unexpected thread name: {name}' 524 | for json in jsons('exceptions'): 525 | print('checking',json) 526 | assert verify_thread(load_thread(json), exceptions_ref) 527 | for json in jsons('untraced_catcher'): 528 | print('checking',json) 529 | ref = clean_untraced_caller_ref if 'fi-gcc' in json else (dirty_untraced_caller_ref if 'xray' not in json else dirty_untraced_catcher_xray_ref) 530 | assert verify_thread(load_thread(json), ref) 531 | for json in jsons('untraced_funcs'): 532 | print('checking',json) 533 | assert verify_thread(load_thread(json), untraced_funcs_ref) 534 | for json in jsons('longjmp'): 535 | print('checking',json) 536 | assert verify_thread(load_thread(json), longjmp_ref) 537 | for json in jsons('tailcall'): 538 | print('checking',json) 539 | assert verify_thread(load_thread(json), tailcall_clean_ref if 'fi-' in json else tailcall_dirty_ref) 540 | for json in jsons('orphans'): 541 | print('checking',json) 542 | assert verify_thread(load_thread(json), orphans_ref(json)) 543 | for json in jsons('buf_size'): 544 | 
print('checking',json) 545 | threads = load_threads(json) 546 | assert verify_thread(threads['event_buf_1'], buf_size_ref) 547 | num_f_calls = len([name for _,name,_ in threads['event_buf_16'] if name.startswith('f()')]) 548 | assert num_f_calls <= 16*2 and num_f_calls >= 14*2, f'wrong number of f calls: {num_f_calls}' 549 | for json in jsons('sigtrap'): 550 | print('checking',json) 551 | thread = load_thread(json) 552 | assert len([name for _,name,_ in thread if name.startswith('traced_func')]) >= 100 553 | for json in jsons('shared'): 554 | print('checking',json) 555 | for thread in load_threads(json).values(): 556 | assert verify_thread(thread, shared_ref) 557 | for json in jsons('asm_filter'): 558 | print('checking',json) 559 | if 'xray' not in json: # we don't support asm filtering for XRay 560 | assert verify_thread(load_thread(json), asm_filter_ref) 561 | for json in jsons('ftrace'): 562 | print('checking',json) 563 | ftrace = load_ftrace(json) 564 | threads = load_threads(json) 565 | check_ftrace(ftrace, threads) 566 | for json in jsons('c'): 567 | print('checking',json) 568 | assert verify_thread(load_thread(json), c_ref) 569 | 570 | # funcount test 571 | for symcount_txt in sorted(glob.glob(f'{OUTDIR}/count.*/symcount.txt')): 572 | print('checking',symcount_txt) 573 | check_count_results(symcount_txt) 574 | 575 | # check last... 
might fail intermittently because we sleep for more than we asked for 576 | # due to the machine being loaded or whatever 577 | for json in jsons('freq'): 578 | print('checking',json) 579 | t = load_thread(json) 580 | assert verify_thread(t, freq_ref) 581 | slept = t[1][-1]-t[0][-1] 582 | assert slept >= 1500 and slept < 1700, f'wrong sleeping time {slept}' 583 | 584 | def check_funtrace_from_core_dump(test): 585 | testdir = f'{OUTDIR}/{os.path.basename(test)}' 586 | # the test produces an empty trace with no samples to extract to funtrace.json 587 | tracejson = f'{testdir}/funtrace.json' 588 | assert not os.path.exists(tracejson) 589 | assert os.path.exists(f'{testdir}/core'), f'{testdir}/core not found - is your /proc/sys/kernel/core_pattern set to "core", and is core dump size unlimited in the shell?' 590 | 591 | system(f'cd {testdir} && gdb -q ../../{test} core -x ../../funtrace_gdb.py -ex funtrace -ex quit') 592 | system(f'./target/{TARGET}/release/funtrace2viz {testdir}/funtrace.raw {testdir}/funtrace') 593 | 594 | # core dump analysis should produce a sample that will be extraced to funtrace.json 595 | assert os.path.exists(tracejson) 596 | 597 | data = parse_perfetto_json(tracejson) 598 | threads = data['threads'] 599 | ftrace = data['systemTraceEvents'] 600 | assert 'sched_waking: comm=child', f'bad ftrace data:\n{ftrace}' 601 | 602 | # check that both the active and the recently finished thread were found 603 | assert len(threads) == 3 604 | assert 'child' in threads 605 | for thread in threads.values(): 606 | is_main = len([name for _,name,_ in thread if name.startswith('main')]) > 0 607 | if is_main: 608 | # we're checking, in particular, that after saving a snapshot we don't have "noise" trace entries from funtrace itself 609 | thread = [(what,name,when) for what,name,when in thread if '_GLOBAL__' not in name and '__static_initialization_and_destruction' not in name] 610 | ref = killed_main_ref if is_main else killed_children_ref 611 | assert 
verify_thread(thread, ref) 612 | 613 | def check_orphan_tracer_removal(): 614 | def funtrace_pid(s): 615 | try: 616 | t = s.split('.') 617 | assert len(t) == 2 and t[0]=='funtrace' 618 | return int(t[1]) 619 | except: 620 | return 0 621 | def find_tracers(): 622 | return [f for f in glob.glob('/sys/kernel/tracing/instances/funtrace.*') if funtrace_pid(os.path.basename(f))] 623 | tracers = find_tracers() 624 | assert len(tracers) >= 4, f'expected at least 4 funtrace ftrace instances, found {len(tracers)}: {tracers}' 625 | print('\n'.join(['orphan tracer instances:']+tracers)) 626 | 627 | # could be any funtrace-instrumented program - they clean orphan tracer dirs upon exit 628 | system(f'cd out/benchmark.pg; ../../{BUILDDIR}/benchmark.pg') 629 | for t in tracers: 630 | pid = funtrace_pid(os.path.basename(t)) 631 | # either the PID exists or the tracer was removed by the run of benchmark.pg 632 | assert os.path.exists('/proc/%d'%pid) or not os.path.exists(t) 633 | 634 | tracers = find_tracers() 635 | print('\n'.join(['orphan tracer instances:']+tracers)) 636 | 637 | if __name__ == '__main__': 638 | main() 639 | -------------------------------------------------------------------------------- /funtrace2viz/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{self, Read, Seek, SeekFrom}; 3 | use std::io::prelude::*; 4 | use std::mem; 5 | use bytemuck::{Pod, Zeroable}; 6 | use std::collections::{HashMap, HashSet}; 7 | use procaddr2sym::{ProcAddr2Sym, SymInfo}; 8 | use serde_json::Value; 9 | use clap::Parser; 10 | use std::cmp::{min, max}; 11 | use num::{FromPrimitive, Zero}; 12 | use num::rational::Ratio; 13 | use num::bigint::BigInt; 14 | 15 | const RETURN_BIT: i32 = 63; 16 | const RETURN_WITH_CALLER_ADDRESS_BIT: i32 = 62; 17 | const CATCH_MASK: u64 = (1< bool { ((n>>b)&1) != 0 } 24 | 25 | // Struct to represent a 16-byte FUNTRACE entry 26 | #[repr(C)] 27 | #[derive(Debug, Pod, Zeroable, 
Clone, Copy)] 28 | struct FunTraceEntry { 29 | address: u64, 30 | cycle: u64, 31 | } 32 | 33 | struct SourceCode { 34 | json_str: String, 35 | num_lines: usize, 36 | } 37 | 38 | #[derive(Parser)] 39 | #[clap(about="convert funtrace.raw to JSON files in the viztracer/vizviewer format (pip install viztracer; or use Perfetto but then you won't see source code)", version)] 40 | struct Cli { 41 | #[clap(help="funtrace.raw input file with one or more trace samples")] 42 | funtrace_raw: String, 43 | #[clap(help="basename.json, basename.1.json, basename.2.json... are created, one JSON file per trace sample")] 44 | out_basename: String, 45 | #[clap(short, long, help="print the static addresses and executable/shared object files of decoded functions in addition to name, file & line")] 46 | executable_file_info: bool, 47 | #[clap(short, long, help="print the raw timestamps (the default is to subtract the timestamp of the earliest reported event at each sample, so that time starts at 0; in particular it helps to avoid rounding issues you might see with large timestamp values)")] 48 | raw_timestamps: bool, 49 | #[clap(short, long, help="ignore events older than this relatively to the latest recorded event in a given trace sample (very old events create the appearance of a giant blank timeline in vizviewer/Perfetto which zooms out to show the recorded timeline in full)")] 50 | max_event_age: Option, 51 | #[clap(short, long, help="ignore events older than this cycle (like --max-event-age but as a timestamp instead of an age in cycles)")] 52 | oldest_event_time: Option, 53 | #[clap(short, long, help="dry run - only list the samples & threads with basic stats, don't decode into JSON")] 54 | dry: bool, 55 | #[clap(short, long, help="ignore samples with indexes outside this list")] 56 | samples: Vec, 57 | #[clap(short, long, help="ignore threads with TIDs outside this list (including for the purpose of interpreting --max-event-age)")] 58 | threads: Vec, 59 | } 60 | 61 | struct 
TraceConverter { 62 | procaddr2sym: ProcAddr2Sym, 63 | // we dump source code into the JSON files to make it visible in vizviewer 64 | source_cache: HashMap, 65 | sym_cache: HashMap, 66 | max_event_age: Option, 67 | raw_timestamps: bool, 68 | time_base: u64, 69 | oldest_event_time: Option, 70 | dry: bool, 71 | samples: Vec, 72 | threads: Vec, 73 | cpu_freq: u64, 74 | cmd_line: String, 75 | first_event_in_json: bool, 76 | first_event_in_thread: bool, 77 | num_events: i64, 78 | } 79 | 80 | #[repr(C)] 81 | #[derive(Debug, Pod, Zeroable, Clone, Copy)] 82 | struct ThreadID 83 | { 84 | pid: u64, 85 | tid: u64, 86 | name: [u8; 16], 87 | } 88 | 89 | struct ThreadTrace { 90 | thread_id: ThreadID, 91 | trace: Vec, 92 | } 93 | 94 | struct FtraceEvent { 95 | timestamp: u64, 96 | line: String, 97 | } 98 | 99 | fn parse_ftrace_lines(input: &String, transform_timestamp: impl Fn(u64) -> String) -> Vec { 100 | let mut results = Vec::new(); 101 | 102 | for line in input.lines() { 103 | // Find the timestamp section 104 | if let Some(colon_pos) = line.find(": ") { 105 | // Search backwards from colon to find the start of timestamp 106 | if let Some(space_before_ts) = line[..colon_pos].rfind(char::is_whitespace) { 107 | let timestamp_str = &line[space_before_ts + 1..colon_pos]; 108 | 109 | // Parse the timestamp 110 | if let Ok(timestamp) = timestamp_str.parse::() { 111 | // Split line into parts 112 | let before_ts = &line[..space_before_ts + 1]; 113 | let after_ts = &line[colon_pos..]; 114 | 115 | // Create modified line with transformed timestamp 116 | let modified_line = format!( 117 | "{}{}{}", 118 | before_ts, 119 | transform_timestamp(timestamp), 120 | after_ts 121 | ); 122 | 123 | results.push(FtraceEvent { 124 | timestamp, 125 | line: modified_line, 126 | }); 127 | } 128 | } 129 | } 130 | } 131 | 132 | results 133 | } 134 | 135 | fn rat2dec(rat: &Ratio, decimal_places: u32) -> String { 136 | let mut result = "".to_string(); 137 | let mut rational = rat.clone(); 138 | if rat < 
&Ratio::from_u64(0).unwrap() { //shouldn't happen in this program but let's print correctly if it does 139 | rational = -rat; 140 | result = "-".to_string(); 141 | } 142 | // Round - add 0.0..05 143 | let rounded = rational + Ratio::from_u64(5).unwrap() / Ratio::from_u64(10u64.pow(decimal_places+1)).unwrap(); 144 | 145 | // Get numerator and denominator 146 | let numerator = rounded.numer(); 147 | let denominator = rounded.denom(); 148 | 149 | // Perform division with extra precision to ensure accuracy 150 | let mut quotient = numerator / denominator; 151 | let mut remainder = numerator % denominator; 152 | 153 | // Build the decimal string 154 | result = result + &quotient.to_string(); 155 | 156 | if !remainder.is_zero() { 157 | result.push('.'); 158 | 159 | // Calculate decimal digits 160 | for _ in 0..decimal_places { 161 | remainder *= 10; 162 | quotient = &remainder / denominator; 163 | remainder = &remainder % denominator; 164 | result.push_str(&quotient.to_string()); 165 | 166 | if remainder.is_zero() { 167 | break; 168 | } 169 | } 170 | } 171 | 172 | result 173 | } 174 | 175 | impl TraceConverter { 176 | pub fn new(args: &Cli) -> Self { 177 | TraceConverter { procaddr2sym: ProcAddr2Sym::new(), source_cache: HashMap::new(), sym_cache: HashMap::new(), 178 | max_event_age: args.max_event_age, raw_timestamps: args.raw_timestamps, time_base: 0, 179 | oldest_event_time: args.oldest_event_time, dry: args.dry, 180 | samples: args.samples.clone(), threads: args.threads.clone(), cpu_freq: 0, cmd_line: "".to_string(), 181 | first_event_in_json: false, first_event_in_thread: false, num_events: 0 182 | } 183 | } 184 | 185 | fn oldest_event(&self, sample_entries: &Vec<ThreadTrace>, ftrace_events: &Vec<FtraceEvent>) -> u64 { 186 | let mut youngest = 0; 187 | let mut oldest = u64::MAX; 188 | for entries in sample_entries { 189 | if self.threads.is_empty() || self.threads.contains(&entries.thread_id.tid) { 190 | oldest = min(entries.trace.first().unwrap().cycle, oldest); 191 | youngest = 
max(entries.trace.last().unwrap().cycle, youngest);
            }
        }
        if !ftrace_events.is_empty() {
            oldest = min(ftrace_events.first().unwrap().timestamp, oldest);
            youngest = max(ftrace_events.last().unwrap().timestamp, youngest);
        }
        if let Some(max_age) = self.max_event_age {
            //saturating_sub: if --max-event-age exceeds the youngest timestamp, plain
            //subtraction would underflow u64 (panic in debug builds, wrap in release);
            //saturating to 0 means "report everything", which is the sane interpretation
            youngest.saturating_sub(max_age)
        }
        else if let Some(oldest_to_report) = self.oldest_event_time {
            oldest_to_report
        }
        else {
            oldest
        }
    }

    //writes one Chrome-trace "complete" (ph:X) event for a call/return pair, emitting the
    //thread/process metadata events the first time a thread is seen, and caching the source
    //file of the called function for later dumping into the JSON.
    //extra_ns shifts the return timestamp if positive or the call timestamp if negative
    //NOTE(review): HashSet's generic parameter was stripped by extraction; SymInfo restored
    //from funcset.insert(call_sym.clone()) below - confirm against original.
    fn write_function_call_event(&mut self, json: &mut File, call_sym: &SymInfo, call_cycle: u64, return_cycle: u64, extra_ns: i32, thread_id: &ThreadID, funcset: &mut HashSet<SymInfo>) -> io::Result<()> {
        self.num_events += 1;
        if self.dry {
            return Ok(());
        }
        if self.first_event_in_thread {
            //strip the NUL padding from the fixed-size (16-byte) thread name buffer
            let name: Vec<_> = thread_id.name.iter().filter(|&&x| x != 0 as u8).copied().collect();
            json.write(format!(r#"{}{{"ph":"M","pid":{},"tid":{},"name":"thread_name","args":{{"name":{}}}}}"#,
                if self.first_event_in_json { "" } else { "\n," },
                thread_id.pid,thread_id.tid,Value::String(String::from_utf8(name).unwrap()).to_string()).as_bytes())?;
            self.first_event_in_thread = false;
            self.first_event_in_json = false;

            if thread_id.pid == thread_id.tid {
                json.write(format!(r#"{}{{"ph":"M","pid":{},"tid":{},"name":"process_name","args":{{"name":{}}}}}"#, "\n,",
                    thread_id.pid,thread_id.tid,Value::String(self.cmd_line.clone()).to_string()).as_bytes())?;
            }
        }
        //using f64 would lose precision for machines with an uptime > month since f64 stores
        //52 mantissa bits and TSC increments a couple billion times per second.
230 | //we use rational numbers instead 231 | let rat = |n: u64| Ratio::from_u64(n).unwrap(); 232 | let cycles_per_us = rat(self.cpu_freq) / rat(1000000); 233 | 234 | let (extra_ret, extra_call) = if extra_ns > 0 { 235 | (rat(extra_ns as u64) / rat(1000), rat(0)) 236 | } 237 | else { 238 | (rat(0), rat(-extra_ns as u64) / rat(1000)) 239 | }; 240 | 241 | let digits = 4; //Perfetto timeline has nanosecond precision - no point in printing 242 | //more digits than 3 for the microsecond timestamps it expects in the JSON; we print 4 243 | //for testing to make sure that cycles don't round to the same ns that should be distinct events 244 | 245 | if return_cycle != 0 && call_cycle != 0 { // a "complete" event (ph:X); these needn't be sorted by timestamp 246 | //note that we could have used the B and E events for "incomplete" function calls missing a call 247 | //or a return timestamp. however, the last orphan B event seems to be missing from Perfetto's rendering 248 | //and all of the orphan E events seem to be missing; B and E are apparently mostly designed to come in pairs 249 | //(despite the beautiful gradient that orphan B events are rendered with) 250 | json.write(format!(r#"{}{{"tid":{},"ts":{},"dur":{},"name":{},"ph":"X","pid":{}}}"#, "\n,", 251 | thread_id.tid, 252 | rat2dec(&(rat(call_cycle-self.time_base)/cycles_per_us.clone() - extra_call.clone()), digits), 253 | rat2dec(&(rat(return_cycle-call_cycle)/cycles_per_us + extra_call + extra_ret), digits), 254 | json_name(call_sym), thread_id.pid).as_bytes())?; 255 | } 256 | 257 | funcset.insert(call_sym.clone()); 258 | 259 | //cache the source code if it's the first time we see this file 260 | if !self.source_cache.contains_key(&call_sym.file) { 261 | let mut source_code: Vec = Vec::new(); 262 | if let Ok(mut source_file) = File::open(&call_sym.file) { 263 | source_file.read_to_end(&mut source_code)?; 264 | } 265 | else if call_sym.file != "??" 
{ 266 | println!("WARNING: couldn't open source file {} - you can remap paths using a substitute-path.json file in your working directory", call_sym.file); 267 | } 268 | let json_str = Value::String(String::from_utf8(source_code.clone()).unwrap()).to_string(); 269 | let num_lines = source_code.iter().filter(|&&b| b == b'\n').count(); //TODO: num newlines 270 | //might be off by one relatively to num lines... 271 | self.source_cache.insert(call_sym.file.clone(), SourceCode{ json_str, num_lines }); 272 | } 273 | Ok(()) 274 | } 275 | 276 | fn write_sample_to_json(&mut self, fname: &String, sample_entries: &Vec, ftrace_text: &String) -> io::Result<()> { 277 | let mut json = if self.dry { File::open("/dev/null")? } else { File::create(fname)? }; 278 | if !self.dry { 279 | json.write(br#"{ 280 | "traceEvents": [ 281 | "#)?; 282 | println!("decoding a trace sample logged by `{}` into {} ...", self.cmd_line, fname); 283 | } 284 | else { 285 | println!("inspecting sample {} logged by `{}` (without creating the file...)", fname, self.cmd_line); 286 | } 287 | 288 | // we list the set of functions (to tell their file, line pair to vizviewer); 289 | // we also use this set to only dump the relevant part of the source cache to each 290 | // json (the source cache persists across samples/jsons but not all files are relevant 291 | // to all samples) 292 | let mut funcset: HashSet = HashSet::new(); 293 | self.first_event_in_json = true; 294 | let mut ignore_addrs: HashSet = HashSet::new(); 295 | 296 | let rat = |n: u64| Ratio::from_u64(n).unwrap(); 297 | //ftrace timestamps are supposed to be in seconds; CPU frequency is in TSC cycles per second; 298 | //so dividing by frequency will convert TSC to seconds. Perfetto timeline accuracy is ns 299 | //hence 10 digits after '.' 
(9 plus another to make sure different cycles don't become the same ns) 300 | let cycles_per_second = rat(self.cpu_freq); 301 | let fixts = |ts: u64| format!("{}", rat2dec(&(rat(ts)/cycles_per_second.clone()), 10)); 302 | let mut ftrace_events = parse_ftrace_lines(ftrace_text, fixts); 303 | 304 | let oldest = self.oldest_event(sample_entries, &ftrace_events); 305 | self.time_base = if self.raw_timestamps { 0 } else { oldest }; 306 | 307 | if self.time_base > 0 { 308 | //TODO: a bit wasteful to reparse this just to subtract the time base 309 | let fixts = |ts: u64| format!("{}", rat2dec(&(rat(ts-self.time_base)/cycles_per_second.clone()), 10)); 310 | ftrace_events = parse_ftrace_lines(ftrace_text, fixts); 311 | } 312 | 313 | ftrace_events.retain(|event| event.timestamp >= oldest); 314 | 315 | for thread_trace in sample_entries { 316 | let entries = &thread_trace.trace; 317 | if !self.threads.is_empty() && !self.threads.contains(&thread_trace.thread_id.tid) { 318 | println!("ignoring thread {} - not on the list {:?}", thread_trace.thread_id.tid, self.threads); 319 | continue; 320 | } 321 | let mut stack: Vec = Vec::new(); 322 | self.num_events = 0; 323 | let earliest_cycle = max(entries[0].cycle, oldest); 324 | let latest_cycle = entries[entries.len()-1].cycle; 325 | let mut num_orphan_returns = 0; 326 | self.first_event_in_thread = true; 327 | 328 | let mut expecting_to_return_into_sym = self.procaddr2sym.unknown_symbol(); 329 | 330 | for entry in entries { 331 | if oldest > entry.cycle { 332 | continue; //ignore old events 333 | } 334 | let catch = (entry.address & CATCH_MASK) == CATCH_MASK; 335 | let ret_with_caller_addr = bit_set(entry.address, RETURN_WITH_CALLER_ADDRESS_BIT) && !catch; 336 | let ret = (bit_set(entry.address, RETURN_BIT) || ret_with_caller_addr) && !catch; 337 | let addr = entry.address & ADDRESS_MASK; 338 | 339 | if !self.sym_cache.contains_key(&addr) { 340 | let sym = self.procaddr2sym.proc_addr2sym(addr); 341 | //we ignore "virtual override 
thunks" because they aren't interesting 342 | //to the user, and what's more, some of them call __return__ but not 343 | //__fentry__ under -pg, so you get spurious "orphan returns" (see below 344 | //how we handle supposedly "real" orphan returns.) 345 | if sym.demangled_func.contains("virtual override thunk") { 346 | ignore_addrs.insert(addr); 347 | } 348 | self.sym_cache.insert(addr, sym); 349 | } 350 | if ignore_addrs.contains(&addr) { 351 | continue; 352 | } 353 | //println!("{} {} sym {}", stack.len(), if catch { "catch" } else if ret { "ret" } else { "call" }, json_name(self.sym_cache.get(&addr).unwrap())); 354 | if catch { 355 | //pop the entries on the stack until we find the function which logged the catch entry. 356 | //if we don't find it, perhaps its call entry didn't make it into our trace, or, more 357 | //troublingly, it was compiled without instrumentation or something else went wrong which 358 | //will cause us to pop everything from the stack. but resetting the stack upon a catch 359 | //is probably less bad than leaving it as is since then it would keep growing with 360 | //every catch 361 | // 362 | //TODO: we could probably improve the handling of "uninstrumented catchers" by keeping 363 | //a history of the fully-popped stacks and then when a return arrives of a function 364 | //in one of these stacks that was "orphaned" by the throw/catch, we could find its call 365 | //entry in this history and reconstruct the call sequence. this could be done given demand; 366 | //ATM we just advise against compiling "catchers" without instrumentation. 
[note that 367 | //the improvement above would work some of the time but not always, eg because a return 368 | //of any of the catcher's caller wasn't traced, either because it didn't happen or 369 | //because the callers of the catcher were also uninstrumented - and this isn't a far-fetched 370 | //scenario, eg if you have some loop with the top-level code catching exceptions, 371 | //it might be running "indefinitely" so you won't see a return that would trigger the 372 | //logic above. so advising against uninstrumented catchers 373 | //will remain valid even if we add all the logic described above.] 374 | let catcher = self.sym_cache.get(&addr).unwrap().demangled_func.clone(); 375 | let mut unwound = 0; 376 | while !stack.is_empty() { 377 | let last = stack.last().unwrap(); 378 | if bit_set(last.address, CALL_RETURNING_UPON_THROW_BIT) { 379 | //this was traced with -finstrument-functions or "something" that would have 380 | //recorded a return event had it been returned from due to stack unwinding 381 | break; 382 | } 383 | let call_sym = self.sym_cache.get(&(last.address & ADDRESS_MASK)).unwrap(); 384 | if catcher == call_sym.demangled_func { //we don't compare by address since it could be two 385 | //different symbols - we entered "f(int)" and we are catching inside "f(int) [clone .cold]"; 386 | //procaddr2sym strips the [clone...] 
from the name so we can compare by it 387 | break; 388 | } 389 | //these all end at the same cycle contrary to the JSON spec's perfect nesting requirement; 390 | //unlike XRay we try to make them stand apart by 1 ns (the timeline's precision), also makes testing more straightforward 391 | unwound += 1; 392 | self.write_function_call_event(&mut json, &call_sym.clone(), last.cycle, entry.cycle, unwound, &thread_trace.thread_id, &mut funcset)?; 393 | stack.pop(); 394 | } 395 | continue; 396 | } 397 | if !ret { 398 | stack.push(*entry); 399 | } 400 | else { 401 | let ret_sym = self.sym_cache.get(&addr).unwrap().clone(); 402 | 403 | if stack.is_empty() { //an "orphan return" - the call wasn't in the trace 404 | num_orphan_returns += 1; 405 | //if ret_with_caller_addr, record the return into the function we're expecting to return into (might be unknown 406 | //or we could know by getting a previous return event with the caller's address) 407 | let sym = if ret_with_caller_addr { &expecting_to_return_into_sym } else { &ret_sym }; 408 | self.write_function_call_event(&mut json, sym, earliest_cycle, entry.cycle, -num_orphan_returns, &thread_trace.thread_id, &mut funcset)?; 409 | if ret_with_caller_addr { 410 | expecting_to_return_into_sym = ret_sym.clone(); 411 | } 412 | continue; 413 | } 414 | if ret_with_caller_addr { 415 | //this might be useful if we get an orphan return next 416 | expecting_to_return_into_sym = ret_sym.clone(); 417 | } 418 | 419 | let call_entry = stack.pop().unwrap(); 420 | let mut call_cycle = call_entry.cycle; 421 | let mut call_sym = self.sym_cache.get(&(call_entry.address & ADDRESS_MASK)).unwrap().clone(); 422 | //warn if we return to a different function from the one predicted by the call stack. 
423 | //this "shouldn't happen" but it does unless we ignore "virtual override thunks" 424 | //and it's good to at least emit a warning when it does since the trace will look strange 425 | 426 | //warn if we're returning to a function different than predicted by the call stack, 427 | //and try to recover from the problem by popping from the stack until we find right function 428 | //(eg setjmp/longjmp can cause this problem). 429 | let mut returns = 0; 430 | if !ret_with_caller_addr { 431 | //comparing names instead of addresses because of the [clone ...] business - not sure if we can 432 | //call one clone and return into another but who knows, certainly catch returns to another clone at times 433 | if ret_sym.demangled_func != call_sym.demangled_func { 434 | println!(" WARNING: call/return mismatch - {} popped from the stack but {} returning", json_name(&call_sym), json_name(&ret_sym)); 435 | let mut found = false; 436 | while !found { 437 | self.write_function_call_event(&mut json, &call_sym.clone(), call_cycle, entry.cycle, returns, &thread_trace.thread_id, &mut funcset)?; 438 | if stack.is_empty() { 439 | break; 440 | } 441 | let last = stack.last().unwrap(); 442 | call_sym = self.sym_cache.get(&(last.address & ADDRESS_MASK)).unwrap().clone(); 443 | call_cycle = last.cycle; 444 | println!(" WARNING: popping {}", json_name(&call_sym)); 445 | stack.pop(); 446 | returns += 1; 447 | found = ret_sym.demangled_func == call_sym.demangled_func; 448 | } 449 | } 450 | } 451 | else if !stack.is_empty() { 452 | let ret_caller_sym = self.sym_cache.get(&(stack.last().unwrap().address & ADDRESS_MASK)).unwrap(); 453 | if ret_sym.demangled_func != ret_caller_sym.demangled_func && stack.iter().any(|&entry| self.sym_cache.get(&(entry.address & ADDRESS_MASK)).unwrap().demangled_func == ret_sym.demangled_func) { 454 | println!(" WARNING: call/return mismatch - {} called from {}, the returning function's caller is {}", json_name(&call_sym), json_name(ret_caller_sym), 
json_name(&ret_sym)); 455 | let mut found = false; 456 | while !found { 457 | self.write_function_call_event(&mut json, &call_sym.clone(), call_cycle, entry.cycle, returns, &thread_trace.thread_id, &mut funcset)?; 458 | if stack.is_empty() { 459 | break; 460 | } 461 | let last = stack.last().unwrap(); 462 | call_sym = self.sym_cache.get(&(last.address & ADDRESS_MASK)).unwrap().clone(); 463 | call_cycle = last.cycle; 464 | println!(" WARNING: popping {}", json_name(&call_sym)); 465 | stack.pop(); 466 | returns += 1; 467 | found = !stack.is_empty() && ret_sym.demangled_func == self.sym_cache.get(&(stack.last().unwrap().address & ADDRESS_MASK)).unwrap().demangled_func; 468 | } 469 | } 470 | } 471 | self.write_function_call_event(&mut json, &call_sym, call_cycle, entry.cycle, returns, &thread_trace.thread_id, &mut funcset)?; 472 | } 473 | } 474 | //if the stack isn't empty, record a call with a fake return cycle 475 | let mut fake_returns = stack.len() as i32; 476 | for entry in &stack { 477 | let call_sym = self.sym_cache.get(&(entry.address & ADDRESS_MASK)).unwrap(); 478 | self.write_function_call_event(&mut json, &call_sym.clone(), entry.cycle, latest_cycle, fake_returns, &thread_trace.thread_id, &mut funcset)?; 479 | fake_returns -= 1; 480 | } 481 | let name = String::from_utf8(thread_trace.thread_id.name.iter().filter(|&&x| x != 0 as u8).copied().collect()).unwrap(); 482 | if latest_cycle >= earliest_cycle { 483 | println!(" thread {} {} - {} recent function calls logged over {} cycles [{} - {}]", thread_trace.thread_id.tid, name, self.num_events, latest_cycle-earliest_cycle, earliest_cycle-self.time_base, latest_cycle-self.time_base); 484 | } 485 | else { 486 | println!(" skipping thread {} {} (all {} logged function entry/return events are too old)", thread_trace.thread_id.tid, name, entries.len()); 487 | } 488 | } 489 | if self.dry { 490 | return Ok(()) 491 | } 492 | 493 | json.write(b"],\n")?; 494 | 495 | if !ftrace_events.is_empty() { 496 | let joined: String 
= ftrace_events.iter().map(|e| e.line.clone() + "\n").collect(); 497 | 498 | json.write(br#""systemTraceEvents": "#)?; 499 | //# tracer: nop is something Perfetto doesn't seem to need but the Chromium trace 500 | //JSON spec insists is a must 501 | json.write(Value::String("# tracer: nop\n".to_string() + &joined).to_string().as_bytes())?; 502 | json.write(b",\n")?; 503 | 504 | let oldest_ftrace = ftrace_events[0].timestamp; 505 | let newest_ftrace = ftrace_events[ftrace_events.len()-1].timestamp; 506 | println!(" ftrace - {} events logged over {} cycles [{} - {}]", ftrace_events.len(), newest_ftrace-oldest_ftrace, oldest_ftrace-self.time_base, newest_ftrace-self.time_base); 507 | } 508 | 509 | // find the source files containing the functions in this sample's set 510 | let mut fileset: HashSet = HashSet::new(); 511 | for sym in funcset.iter() { 512 | fileset.insert(sym.file.clone()); 513 | } 514 | json.write(br#""viztracer_metadata": { 515 | "version": "0.16.3", 516 | "overflow": false, 517 | "producer": "funtrace2viz" 518 | }, 519 | "file_info": { 520 | "files": { 521 | "#)?; 522 | 523 | // dump the source code of these files into the json 524 | for (i, file) in fileset.iter().enumerate() { 525 | if let Some(&ref source_code) = self.source_cache.get(file) { 526 | json.write(Value::String(file.clone()).to_string().as_bytes())?; 527 | json.write(b":[")?; 528 | json.write(source_code.json_str.as_bytes())?; 529 | json.write(b",")?; 530 | json.write(format!("{}", source_code.num_lines).as_bytes())?; 531 | json.write(if i==fileset.len()-1 { b"]\n" } else { b"],\n" })?; 532 | } 533 | } 534 | json.write(br#"}, 535 | "functions": { 536 | "#)?; 537 | 538 | // tell where each function is defined 539 | for (i, sym) in funcset.iter().enumerate() { 540 | // line-3 is there to show the function prototype in vizviewer/Perfetto 541 | // (often the debug info puts the line at the opening { of a function 542 | // and then the prototype is not seen, it can also span a few lines) 543 
| json.write(format!("{}:[{},{}]{}\n", json_name(sym), Value::String(sym.file.clone()).to_string(), if sym.line <= 3 { sym.line } else { sym.line-3 }, if i==funcset.len()-1 { "" } else { "," }).as_bytes())?; 544 | } 545 | json.write(b"}}}\n")?; 546 | 547 | Ok(()) 548 | } 549 | 550 | pub fn parse_chunks(&mut self, file_path: &String, json_basename: &String) -> io::Result<()> { 551 | let mut file = File::open(file_path)?; 552 | 553 | let mut sample_entries: Vec = Vec::new(); 554 | let mut num_json = 0; 555 | 556 | let mut thread_id = ThreadID { pid: 0, tid: 0, name: [0; 16] }; 557 | 558 | let mut ftrace_text = "".to_string(); 559 | 560 | self.procaddr2sym.input_source = Some(procaddr2sym::input_source(file_path.clone())); 561 | 562 | loop { 563 | //the file consists of chunks with an 8-byte magic string telling the chunk 564 | //type, followed by an 8-byte length field and then contents of that length 565 | let mut magic = [0u8; MAGIC_LEN]; 566 | if file.read_exact(&mut magic).is_err() { 567 | break; // End of file 568 | } 569 | 570 | let mut length_bytes = [0u8; LENGTH_LEN]; 571 | file.read_exact(&mut length_bytes)?; 572 | let chunk_length = usize::from_ne_bytes(length_bytes); 573 | 574 | if &magic == b"FUNTRACE" { 575 | if chunk_length != 8 { 576 | println!("warning: unexpected length {} for FUNTRACE chunk", chunk_length); 577 | file.seek(SeekFrom::Current(chunk_length as i64))?; 578 | continue; 579 | } 580 | let mut freq_bytes = [0u8; 8]; 581 | file.read_exact(&mut freq_bytes)?; 582 | self.cpu_freq = u64::from_ne_bytes(freq_bytes); 583 | } 584 | else if &magic == b"CMD LINE" { 585 | let mut cmd_bytes = vec![0u8; chunk_length]; 586 | file.read_exact(&mut cmd_bytes)?; 587 | self.cmd_line = String::from_utf8(cmd_bytes).unwrap(); 588 | } 589 | else if &magic == b"ENDTRACE" { 590 | if chunk_length != 0 { 591 | println!("warning: non-zero length for ENDTRACE chunk"); 592 | file.seek(SeekFrom::Current(chunk_length as i64))?; 593 | continue; 594 | } 595 | if 
!sample_entries.is_empty() || !ftrace_text.is_empty() { 596 | if self.samples.is_empty() || self.samples.contains(&num_json) { 597 | self.write_sample_to_json(&format_json_filename(json_basename, num_json), &sample_entries, &ftrace_text)?; 598 | } 599 | else { 600 | println!("ignoring sample {} - not on the list {:?}", num_json, self.samples); 601 | } 602 | num_json += 1; 603 | sample_entries.clear(); 604 | ftrace_text.clear(); 605 | } 606 | } 607 | else if &magic == b"PROCMAPS" { 608 | //the content of the dumping process's /proc/self/maps to use when 609 | //interpreting the next trace samples (until another PROCMAPS chunk is encountered) 610 | let mut chunk_content = vec![0u8; chunk_length]; 611 | file.read_exact(&mut chunk_content)?; 612 | self.procaddr2sym.set_proc_maps(chunk_content.as_slice()); 613 | //the symbol cache might have been invalidated if the process unloaded and reloaded a shared object 614 | self.sym_cache = HashMap::new(); 615 | } else if &magic == b"THREADID" { 616 | if chunk_length != std::mem::size_of::() { 617 | println!("Unexpected THREAD chunk length {} - expecting {}", chunk_length, std::mem::size_of::()); 618 | file.seek(SeekFrom::Current(chunk_length as i64))?; 619 | continue; 620 | } 621 | 622 | file.read_exact(bytemuck::bytes_of_mut(&mut thread_id))?; 623 | } else if &magic == b"TRACEBUF" { 624 | if chunk_length % mem::size_of::() != 0 { 625 | println!("Invalid TRACEBUF chunk length {} - must be a multiple of {}", chunk_length, mem::size_of::()); 626 | file.seek(SeekFrom::Current(chunk_length as i64))?; 627 | continue; 628 | } 629 | 630 | let num_entries = chunk_length / mem::size_of::(); 631 | let mut entries = ThreadTrace { thread_id, trace: vec![FunTraceEntry { address: 0, cycle: 0 }; num_entries] }; 632 | file.read_exact(bytemuck::cast_slice_mut(&mut entries.trace))?; 633 | entries.trace.retain(|&entry| !(entry.cycle == 0 && entry.address == 0)); 634 | if !entries.trace.is_empty() { 635 | entries.trace.sort_by_key(|entry| 
entry.cycle); 636 | sample_entries.push(entries); 637 | } 638 | } else if &magic == b"FTRACETX" { 639 | let mut ftrace_bytes = vec![0u8; chunk_length]; 640 | file.read_exact(&mut ftrace_bytes)?; 641 | ftrace_text = String::from_utf8(ftrace_bytes).unwrap(); 642 | } else { 643 | println!("Unknown chunk type: {:?}", std::str::from_utf8(&magic).unwrap_or("")); 644 | file.seek(SeekFrom::Current(chunk_length as i64))?; 645 | } 646 | } 647 | if !sample_entries.is_empty() || !ftrace_text.is_empty() { 648 | println!("warning: FUNTRACE block not closed by ENDTRACE"); 649 | self.write_sample_to_json(&format_json_filename(json_basename, num_json), &sample_entries, &ftrace_text)?; 650 | } 651 | 652 | Ok(()) 653 | } 654 | } 655 | 656 | fn format_json_filename(basename: &String, number: u32) -> String { 657 | if number > 0 { 658 | format!("{}.{}.json", basename, number) 659 | } else { 660 | format!("{}.json", basename) 661 | } 662 | } 663 | 664 | static mut PRINT_BIN_INFO: bool = false; 665 | 666 | fn json_name(sym: &SymInfo) -> String { 667 | //"unsafe" access to a config parameter... I guess I should have put stuff into a struct and have 668 | //most methods operate on it to make it prettier or something?.. 
669 | let print_bin_info = unsafe { PRINT_BIN_INFO }; 670 | if print_bin_info { 671 | Value::String(format!("{} ({}:{} {:#x}@{})", sym.demangled_func, sym.file, sym.line, sym.static_addr, sym.executable_file)).to_string() 672 | } 673 | else { 674 | Value::String(format!("{} ({}:{})", sym.demangled_func, sym.file, sym.line)).to_string() 675 | } 676 | } 677 | 678 | fn main() -> io::Result<()> { 679 | let args = Cli::parse(); 680 | if args.max_event_age.is_some() && args.oldest_event_time.is_some() { 681 | panic!("both --max-event-age and --oldest-event-time specified - choose one"); 682 | } 683 | unsafe { 684 | PRINT_BIN_INFO = args.executable_file_info; 685 | } 686 | let mut convert = TraceConverter::new(&args); 687 | convert.parse_chunks(&args.funtrace_raw, &args.out_basename) 688 | } 689 | 690 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # funtrace - a C/C++ function call tracer for x86/Linux 2 | 3 | A function call tracer is a kind of profiler showing **a timeline of function call and return events**. Here's an example trace captured by funtrace from [Krita](https://krita.org): 4 | 5 | ![image](images/krita-trace.png) 6 | 7 | Here we can see 2 threads - whether they're running or waiting, and the changes to their callstack over time - and the source code of a selected function. 8 | 9 | Unlike a sampling profiler such as perf, **a tracing profiler must be told what to trace** using some runtime API, and also has a **higher overhead** than the fairly low-frequency sampling of the current callstack a-la perf. What do you get in return for the hassle and the overhead (and the hassle of culling the overhead, by disabling tracing of short functions called very often)? 
Unlike flamegraphs showing where the program spends its time on average, traces let you **debug cases of unusually high latency**, including in production (and it's a great idea to collect traces in production, and not just during development!) 10 | 11 | If you're interested in why tracing profilers are useful and how funtrace works, see [Profiling in production with function call traces](https://yosefk.com/blog/profiling-in-production-with-function-call-traces.html). What follows is a funtrace user guide. 12 | 13 | - [Why funtrace?](#why-funtrace) 14 | - [Trying funtrace](#trying-funtrace) 15 | - [Runtime API for taking & saving trace snapshots](#runtime-api-for-taking--saving-trace-snapshots) 16 | - ["Coretime" API for saving trace snapshots](#coretime-api-for-saving-trace-snapshots) 17 | - [Choosing a compiler instrumentation method](#choosing-a-compiler-instrumentation-method) 18 | - [Integrating funtrace into your build system](#integrating-funtrace-into-your-build-system) 19 | - [Culling overhead with `funcount`](#culling-overhead-with-funcount) 20 | - [Decoding traces](#decoding-traces) 21 | - [Compile time & runtime configuration](#compile-time--runtime-configuration) 22 | - [Controlling which functions are traced](#controlling-which-functions-are-traced) 23 | - [Disabling & enabling tracing](#disabling--enabling-tracing) 24 | - [Controlling buffer sizes & lifetimes](#controlling-buffer-sizes--lifetimes) 25 | - [Limitations](#limitations) 26 | - [Funtrace file format](#funtrace-file-format) 27 | 28 | # Why funtrace? 
29 | 30 | * **Low overhead tracing** - FWIW, in my microbenchmark I get <10 ns per instrumented call or return 31 | * **6x faster** than an LLVM XRay microbenchmark with "flight recorder logging" and 15-18x faster than "basic logging" 32 | * **4.5x faster** than a uftrace microbenchmark (note that uftrace isn't just designed for a somewhat different workflow than funtrace - in that it's similar to XRay - but it also has many more features; [check it out](https://github.com/namhyung/uftrace)!) 33 | * Supports **threads, shared libraries and exceptions** 34 | * Supports ftrace events, showing **thread scheduling states** alongside function calls & returns, so you see when time is spent waiting as opposed to computing 35 | * Works with **stock gcc or clang** - no custom compilers or compiler passes 36 | * Easy to integrate into a build system, and even easier to **try *without* touching the build system** using tiny compiler-wrapping scripts “passing all the right flags” 37 | * Small (just ~1K LOC for the runtime) and thus: 38 | * **easy to port** 39 | * **easy to extend** (say, to support some variant of “green threads”/fibers) 40 | * **easy to audit** in case you’re reluctant to add something intrusive like this into your system without understanding it well (as I personally would be!) 41 | * **Relatively comprehensive** – it comes with its own **tool for finding and cutting instrumentation overhead** in test runs too large to fully trace; 42 | support for remapping file paths to locate debug information and source code; a way to **extract trace data from core dumps**, etc. 43 | 44 | # Trying funtrace 45 | 46 | You can clone the repo & build the trace decoder (or unzip [a binary release](https://github.com/yosefk/funtrace/releases)), compile & run a simple example program, and decode its output traces as follows: 47 | 48 | ``` shell 49 | # clone the source... 
50 | git clone https://github.com/yosefk/funtrace 51 | # ...or unzip a binary release 52 | unzip funtrace.zip 53 | 54 | cd funtrace 55 | ./simple-example/build.sh 56 | ./simple-example/run.sh 57 | ``` 58 | 59 | This actually tests 4 different instrumented builds - 2 with gcc and 2 with clang; we'll discuss below how to choose the best method for you. Troubleshooting: 60 | 61 | * With an older clang, you'll get `clang: error: unknown argument: '-fxray-shared'` - in that case, you can use 3 instrumentation methods out of the 4. 62 | * You might have issues accessing ftrace data. This is not a problem for _function tracing_ but it prevents _thread state tracing_, which could tell us when threads are running and when they're waiting: 63 | 64 | ``` 65 | WARNING: funtrace - error initializing ftrace (...), compile with -DFUNTRACE_FTRACE_EVENTS_IN_BUF=0 66 | or run under `env FUNTRACE_FTRACE_EVENTS_IN_BUF=0` if you don't want to collect ftrace / see this warning 67 | ``` 68 | 69 | You can ignore this message, or disable ftrace as described in the message, or you can try making ftrace work. The problem is usually permissions, and one way to make ftrace usable permissions-wise is **`sudo chown -R $USER /sys/kernel/tracing`**. Inside containers, things are more involved, and you might want to consult a source knowing more than this guide. 70 | 71 | You can view the traces produced from the simple example above as follows: 72 | 73 | ``` 74 | pip install viztracer 75 | rehash 76 | vizviewer out/funtrace-fi-gcc.json 77 | vizviewer out/funtrace-pg.json 78 | vizviewer out/funtrace-fi-clang.json 79 | vizviewer out/funtrace-xray.json 80 | ``` 81 | 82 | Funtrace uses [viztracer](https://github.com/gaogaotiantian/viztracer) for visualizing traces, in particular because of its ability to show source code, unlike stock [Perfetto](https://perfetto.dev/) (the basis for vizviewer.) 
83 | 84 | To build your own program with tracing enabled, you can use `compiler-wrappers/funtrace-pg-g++`, `compiler-wrappers/funtrace-finstr-clang++` or the other two compiler wrapper scripts, just like `simple-example/build.sh` does. If the program uses autoconf/configure, you can set the `$CXX` env var to point to one of these scripts, and if it uses cmake, you can pass `-DCMAKE_CXX_COMPILER=/your/chosen/wrapper` to cmake. 85 | 86 | Note that the compiler wrappers slow down the configuration stage, because they compile & link funtrace.cpp, and this is costly at build system config time if the build system compiles many small programs to test for compiler features, library availability and such. For the build itself, the overhead of compiling funtrace.cpp is lower, but might still be annoying if you use a fast linker like mold and are used to near-instantaneous linking. The good thing about the compiler wrappers is that they make trying funtrace easy; if you decide to use funtrace in your program, however, you will probably want to pass the required compiler flags yourself as described below, which will eliminate the build-time overhead of the compiler wrappers. 87 | 88 | Once the program compiles, you can run it as usual, and then `killall -SIGTRAP your-program` (or `kill -SIGTRAP `) when you want to get a trace. The trace will go to `funtrace.raw`; if you use SIGTRAP multiple times, many trace samples will be written to the file. Now you can run `funtrace2viz` the way `simple-example/run.sh` does. You get the funtrace2viz binary from `funtrace.zip`; if you cloned the source repo, you should have funtrace2viz compiled if you ran `simple-example/build.sh`. funtrace2viz will produce a vizviewer JSON file from each trace sample in funtrace.raw, and you can open each JSON file in vizviewer. 89 | 90 | Troubleshooting vizviewer issues: 91 | 92 | * If you see **`Error: RPC framing error`** in the browser tab opened by vizviewer, **reopen the JSON from the web UI**. 
(Note that you want to run vizviewer on every new JSON file, _even if_ it gives you "RPC framing error" when you do it - you _don't_ want to just open the JSON from the web UI since then you won't see source code!) 93 | * If **the timeline looks empty**, it's likely due to some mostly-idle threads having very old events causing the timeline to zoom out too much. (You can simply open the JSON with `less` or whatever - there's a line per function call; if the JSON doesn't look empty, funtrace is working.) **Try passing `--max-event-age` or `--oldest-event-time` to funtrace2viz**; it prints the time range of events recorded for each thread in each trace sample (by default, the oldest event in every sample gets the timestamp 0) and you can use these printouts to decide on the value of the flags. In the next section we'll discuss how to take snapshots at the time you want, of the time range you want, so that you needn't fiddle with flags this way. 94 | 95 | If you build the program, run it, and decode its trace on the same machine/in the same container, life is easy. If not, note that in order for funtrace2viz to work, you need the program and its shared libraries to be accessible at the paths where they were loaded from _in the traced program run_, on the machine _where funtrace2viz runs_. And to see the source code of the functions (as opposed to just function names), you need the source files to be accessible on that machine, at the paths _where they were when the program was built_. If this is not the case, you can remap the paths using a file called `substitute-path.json` in the current directory of funtrace2viz, as described below. 96 | As a side note, if you don't like having to remap source file paths - not just in funtrace but eg in gdb - see [refix](https://github.com/yosefk/refix) which can help to mostly avoid this. 
97 | 98 | Note that if you choose to try XRay instrumentation (`compiler-wrappers/funtrace-xray-clang++`), you need to run with `env XRAY_OPTIONS="patch_premain=true"` like simple-examples/run.sh does. With the other instrumentation options, tracing is on by default. 99 | 100 | The above is how you can give funtrace a quick try. The rest tells how to integrate it in your program "for real." 101 | 102 | # Runtime API for taking & saving trace snapshots 103 | 104 | The next thing after trying funtrace with SIGTRAP is probably using the runtime API to take snapshots of interesting time ranges. (Eventually you'll want proper build system integration - but you probably want to "play some more" beforehand, and since snapshots taken with SIGTRAP aren't taken at "the really interesting times" and capture too much, you'll want to see better snapshots.) 105 | 106 | The recommended method for taking & saving snapshots is: 107 | 108 | * using `funtrace_time()` to find unusually high latency in every flow you care about 109 | * ...then use `funtrace_pause_and_get_snapshot_starting_at_time()` to capture snapshots when a high latency is observed 110 | * ...finally, use `funtrace_write_snapshot()` when you want to save the snapshot(s) taken upon the highest latencies 111 | 112 | In code, it looks something like this: 113 | 114 | ```c++ 115 | #include "funtrace.h" 116 | 117 | void Server::handleRequest() { 118 | uint64_t start_time = funtrace_time(); 119 | 120 | doStuff(); 121 | 122 | uint64_t latency = funtrace_time() - start_time; 123 | if(latency > _slowest) { 124 | funtrace_free_snapshot(_snapshot); 125 | _snapshot = funtrace_pause_and_get_snapshot_starting_at_time(start_time); 126 | _slowest = latency; 127 | } 128 | } 129 | 130 | Server::~Server() { 131 | funtrace_write_snapshot("funtrace-request.raw", _snapshot); 132 | funtrace_free_snapshot(_snapshot); 133 | } 134 | ``` 135 | 136 | There's also `funtrace_pause_and_get_snapshot_up_to_age(max_event_age)` - very similar to 
`funtrace_pause_and_get_snapshot_starting_at_time(start_time)`; and if you want the full content of the trace buffers without an event age limit, there's `funtrace_pause_and_get_snapshot()`. And you can write the snapshot straight from the threads' trace buffers to a file, without allocating memory for a snapshot, using `funtrace_pause_and_write_current_snapshot()` (this is exactly what the SIGTRAP handler does.) 137 | 138 | As implied by their names, **all of these functions pause tracing until they're done** (so that traced events aren't overwritten with new events before we have the chance to save them.) This means that, for example, a concurrent server where `Server::handleRequest()` is called from multiple threads might have a gap in one of the snapshots taken by 2 threads at about the same time; hopefully, unusual latency in 2 threads at the same time is rare, and even if does happen, you'll get at least one good snapshot. 139 | 140 | All of the snapshot-saving functions write to files; an interface for sending the data to some arbitrary stream could be added given demand. 141 | 142 | Finally, a note on the time functions: 143 | 144 | * `funtrace_time()` is a thin wrapper around `__rdtsc()` so you needn't worry about its cost 145 | * `funtrace_ticks_per_second()` gives you the TSC frequency in case you want to convert timestamps or time diffs to seconds/ns 146 | 147 | # "Coretime API" for saving trace snapshots 148 | 149 | While we're on the subject of snapshots - you can get trace data from a core dump by loading `funtrace_gdb.py` from gdb - by running `gdb -x funtrace_gdb.py`, or using the gdb command `python execfile("funtrace_gdb.py")`, or somewhere in `.gdbinit`. 
Then you'll get the extension command `funtrace` which works something like this: 150 | 151 | ``` 152 | (gdb) funtrace 153 | funtrace: saving proc mappings 154 | funtrace: core dump generated by `your-program arg1 arg2` 155 | funtrace: thread 1287700 your-program - saving 1048576 bytes of data read from 0x7fb199c00000 156 | funtrace: thread 1287716 child - saving 1048576 bytes of data read from 0x7fb17c200000 157 | funtrace: saving 22 ftrace events 158 | funtrace: done - decode with `funtrace2viz funtrace.raw out` and then view in viztracer (pip install viztracer) with `vizviewer out.json` 159 | ``` 160 | 161 | Basically it's what SIGTRAP would save to `funtrace.raw`, had it been called right when the core was dumped. Can be very useful to see what the program was doing right before it crashed. 162 | 163 | # Choosing a compiler instrumentation method 164 | 165 | Once you have snapshots of the right time ranges, you might want to settle on a particular compiler instrumentation method. For that, the below can be helpful as well as the next section, which talks about culling overhead with the `funcount` tool (one thing which will help you choose the instrumentation method is how much overhead it adds, which differs between programs, and funcount can help estimate that overhead.) 166 | 167 | Funtrace relies on the compiler inserting hooks upon function calls and returns. Funtrace supports 4 instrumentation methods (2 for gcc and 2 for clang), and comes with a compiler wrapper script passing the right flags to use each: 168 | 169 | * **funtrace-finstr-g++** - gcc with `-finstrument-functions` 170 | * **funtrace-pg-g++** - gcc with `-pg -mfentry -minstrument-return=call` 171 | * **funtrace-finstr-clang++** - clang with `-finstrument-functions` 172 | * **funtrace-xray-clang++** - clang with `-fxray-instrument` 173 | 174 | **"By default," the method used by funtrace-pg-g++ and funtrace-finstr-clang++ is recommended for gcc and clang, respectively**. 
However, for each compiler, there are reasons to use the other method. Here's a table of the methods and their pros and cons, followed by a detailed explanation: 175 | 176 | Method | gcc -finstr | gcc -pg | clang -finstr | clang XRay 177 | --- | --- | --- | --- | --- 178 | before or after inlining? | ❌ before | ✅ after | ✅✅ before or after! | ✅ after 179 | control tracing by source path | ✅ yes | ❌ no | ❌ no | ❌ no 180 | control tracing by function length | ✅ asm | ✅ asm | ✅ asm | ✅✅ compiler 181 | control tracing by function name list | ✅ asm | ✅ asm | ✅ asm | ❌ no 182 | tail call artifacts | ✅ no | ❌ yes | ✅ no | ❌ yes 183 | untraced exception catcher artifacts | ✅ no | ❌ yes | ❌ yes | ❌ yes 184 | needs questionable linker flags | ✅ no | ❌ yes | ✅ no | ❌ yes 185 | 186 | We'll now explain these items in detail, and add a few points about XRay which "don't fit into the table." 187 | 188 | * **Instrument before or after inlining?** You usually prefer "after" - "before" is likely to hurt performance too much (and you can use the NOFUNTRACE macro to suppress the tracing of a function, but you'll need to do this in too many places.) Still, instrumenting before inlining has its uses, eg you can trace the program flow and follow it in vizviewer - for an interactive and/or multithreaded program, this might be easier than using a debugger or an IDE. clang -finstrument-functions is the nicest here - it instruments before inlining, but has a sister flag -finstrument-functions-after-inlining that does what you expect. 189 | * **Control tracing by source path** - gcc's `-finstrument-functions-exclude-file-list=.h,.hpp,/usr/include` (for example) will disable tracing in functions with filenames having the substrings on the comma-separated list. This can somewhat compensate for -finstrument-functions instrumenting before inlining, and you might otherwise use this feature for "targeted tracing." 
190 | * **Control tracing by function length** - XRay has `-fxray-instruction-threshold=N` which excludes short functions from tracing, unless they have loops that XRay assumes will run for a long time. For other instrumentation methods, funtrace comes with its own flag, `-funtrace-instr-thresh=N`, which is implemented by post-processing the assembly code produced by the compiler (funtrace supplies a script, `funtrace++`, which calls the compiler with `-S` instead of `-c` and then post-processes the assembly output and assembles it to produce the final `.o` object file.) XRay's method has 2 advantages, however. Firstly, it removes 100% of the overhead, while funtrace's method removes most (the on-entry/return hooks aren't called), but not all overhead (some extra instructions will appear relatively to the case where the function wasn't instrumented by the compiler in the first place.) Secondly, while the rest of funtrace is very solid, this bit is "hacky"/somewhat heuristical text processing of your compiler-generated assembly, and while it "seems to work" on large programs, you might have reservations against using this in production. 191 | * **Control tracing by function name list** - for all methods other than XRay instrumentation, funtrace provides the flags `-funtrace-do-trace=file` and `-funtrace-no-trace=file` which let you specify which functions to exclude - or not to exclude - from tracing during assembly postprocessing (if you decide to use this postprocessing, of course.) 
This is nice for functions coming from .h files you cannot edit (and thus can't add the `NOFUNTRACE` attribute to the functions you want to exclude); it can also be nice to take a bunch of "frequent callees" reported by the funcount tool (described below) and suppress them using a list of mangled function names, instead of going to the source location of each and adding `NOFUNTRACE` there, especially during experimentation where you're trying to check what suppressing this or that does for the overhead. This doesn't work for XRay ATM (assembly postprocessing could probably be implemented for XRay but would require editing compiler-generated metadata used by the XRay runtime.) 192 | * **Tail call artifacts** is when f calls g, the last thing g does is calling h, and instead of seeing f calling g _which calls h_, you see f calling g _and then h_. This happens because the compiler calls the "on return" hook from g before g's tail call to h. An annoyance if not a huge deal. 193 | * **Untraced exception catcher artifacts** is when you have a function with a `try/catch` block _and_ tracing is disabled for it. In such a case, when an exception is thrown & caught, it looks like _all_ the functions returned and you start from a freshly empty call stack - instead of the correct picture (returning to the function that caught the exception.) This artifact comes from most instrumentation methods not calling the "on return" hook when unwinding the stack. This annoyance is avoided as long as you enable tracing for functions catching exceptions (in which case funtrace traces enough info to get around the return hook not being called upon unwinding.) 194 | * **Questionable linker flags**: 195 | * **clang XRay requires --allow-multiple-definition**. That's because funtrace needs to redefine XRay's on-call/on-return hooks, and there doesn't seem to be another way to do it. If XRay defines its hooks as "weak", this flag will no longer be needed.
196 | * **gcc -pg _precludes_ -Wl,--no-undefined**. That's because its on-return hook, `__return__`, doesn't have a default definition (though its on-entry hook, `__fentry__`, apparently does, as do the entry/return hooks called by -finstrument-functions); your shared objects will get it from the executable but they won't link with `-Wl,--no-undefined`. Note that _all_ the wrappers filter out `-Wl,--no-undefined` so that shared libraries can use the `funtrace_` runtime APIs exported by the executable. However, you don't have to use the runtime APIs in shared objects - you can take snapshots only from code linked into the executable - so except for the -pg mode, this flag is not strictly necessary. 197 | 198 | A few more words about XRay: 199 | 200 | * **XRay instrumentation was enabled in shared libraries in late 2024** and is not yet available in officially released versions. clang versions with XRay shared library support have the `-fxray-shared` flag. 201 | * **XRay uses dynamic code patching for enabling/disabling tracing at runtime.** This is why tracing is off unless you run under `env XRAY_OPTIONS="patch_premain=true"`, or use XRay's runtime APIs to patch the code. Funtrace has its own API, `funtrace_enable/disable_tracing()`, but it deliberately _doesn't_ call XRay's code-patching APIs. Funtrace's API is a quick way to cut most of the overhead of tracing without any self-modifying code business. It's up to you to decide, if you use XRay, whether you want to cut even more overhead by using runtime patching - downsides include creating copies of the code pages, for which you might not have the extra space, and taking more time than funtrace_enable/disable_tracing(). 202 | 203 | # Integrating funtrace into your build system 204 | 205 | You can postpone "real" build system integration for as long as you want, if the compiler wrappers don't slow things down too much for you.
206 | Once you do want to integrate funtrace into your build system, the short story is, **choose an instrumentation method and then compile in the way the respective wrapper in compiler-wrappers does.** However, here are some points worth noting explicitly: 207 | 208 | * **It's fine to compile funtrace.cpp with its own compilation command.** You probably don't want to compile funtrace.cpp when linking your binary the way the wrappers do. They only do it to save you the trouble of adding funtrace.cpp to the list of files for the build system to build (which is harder/more annoying than it sounds, if you're trying to trace someone else's program with a build system you don't really know.) 209 | * **It's best to compile funtrace.cpp without tracing, but "it can handle" being compiled with tracing.** Many build systems make it hard to compile a given file with its own compiler flags. funtrace.cpp uses NOFUNTRACE heavily to suppress tracing; the worst that can happen if you compile it with tracing is that some of its code will be traced despite its best efforts, but it should otherwise work. 210 | * **funtrace.cpp must be compiled _into the executable_, not any of the shared libraries.** Funtrace uses TLS (thread-local storage) and accessing a `thread_local` object is a simple register+offset access when you link the code into an executable, but requires a function call if you link the code into a shared library, because now you need to find _this shared library's TLS area_. So funtrace puts its on-entry/return hooks into the executable, which exports them to the shared libraries. 211 | * **Linker flag requirements** (XRay/`--allow-multiple-definition`, -pg/`-Wl,--no-undefined`) are documented in the previous section; for XRay, you also **need a linker wrapper** like `compiler-wrappers/xray/ld` to make sure funtrace's on-entry/return hooks from funtrace.o are passed before XRay's own hooks on the linker command line. 
212 | * **Pass -pthread** or things will break annoyingly 213 | * **-Wl,--dynamic-list=funtrace.dyn** exports the funtrace runtime API from the executable for the shared libraries 214 | * **-g is for source line info** (it's generally a good idea to use -g in release builds and not just debug builds - if it slows down linking, mold takes care of that; but, if you don't want to compile with -g, funtrace will still give you the function names using the ELF symbol table, only the source code will be missing from vizviewer) 215 | * **Do _not_ pass -pg _to the linker_** - if you use gcc with -pg, and do pass it to the linker, the linker will think that you're compiling for gprof (even if you also pass `-mfentry -minstrument-return=call` which are guaranteed to break gprof, -pg's original application...), and then your program will write a useless gmon.out file in the current directory every time you run it. 216 | * **Some flags in the wrappers are "defaults" that you can change**, specifically: 217 | * `g++ -finstrument-functions-exclude-file-list=.h,.hpp,/usr/include` - of course you can pass a different exclude list 218 | * `clang++ -finstrument-functions-after-inlining` - you can instead pass -finstrument-functions to instrument before inlining 219 | * `-fxray-instruction-threshold=...` is _not_ passed by the XRay wrapper - you can set your own threshold 220 | * **Link the program as C++** - even if it's a C program, the funtrace runtime is in C++ and you'll need to link with g++ or clang++ for things to work 221 | 222 | All the compiler wrappers execute `compiler-wrappers/funtrace++`, itself a compiler wrapper which implements a few flags - `-funtrace-instr-thresh=N`, `-funtrace-ignore-loops`, `-funtrace-do-trace=file`, and `-funtrace-no-trace=file` - for controlling which functions get traced, by changing the assembly code produced by the compiler.
If you don't need any of these flags, you needn't prefix your compilation command with `funtrace++` like the wrappers do. (Funtrace needn't touch the code generated by the compiler for any reason other than supporting these flags.) 223 | 224 | # Culling overhead with `funcount` 225 | 226 | If tracing slows down your program too much, you might want to exclude some functions from tracing. You can do this on some "wide basis", such as "no tracing inside this bunch of libraries, we do compile higher-level logic to trace the overall flow" or such. You can also use `-fxray-instruction-threshold` or `-funtrace-instr-thresh` to automatically exclude short functions without loops. But you might also want to do some "targeted filtering" where you **find functions called very often, and exclude those** (to save both cycles and space in the trace buffer - with many short calls, you need a much larger snapshot to see far enough into the past.) 227 | 228 | `funcount` is a tool for counting function calls, which is recommended for finding "frequent callees" to exclude from traces. Funcount is: 229 | 230 | * **Fast** (about as fast as funtrace and unlike the very slow callgrind) 231 | * **Accurate** (unlike perf which doesn't know how many times a function was called, only how many cycles were spent there and only approximately with its low frequency sampling) 232 | * **Thread-safe** (unlike gprof which produces garbage call counts with multithreaded programs) 233 | * **Small** (~300 LOC) and easy to port 234 | 235 | Finally, funcount **counts exactly the calls funtrace would trace** - nothing that's not traced is counted, and nothing that's traced is left uncounted.
236 | 237 | You enable funcount by passing `-DFUNTRACE_FUNCOUNT` on the command line (only `funtrace.cpp` and `funtrace_pg.S` need this -D, you don't really need to recompile the whole program), or by compiling & linking `funcount.cpp` and `funcount_pg.S` instead of `funtrace.cpp` and `funtrace_pg.S` into your program - whichever is easier in your build system. If the program runs much slower than with funtrace (which can be very slow if you instrument before inlining but otherwise is fairly fast), it must be multithreaded, with the threads running the same code concurrently and fighting over the ownership of the cache lines containing the call counters maintained by funcount. You can compile with `-DFUNCOUNT_PAGE_TABLES=16` or whatever number to have each CPU core update its own copy of each call counter, getting more speed in exchange for space (not that much space - each page table is at worst the size of the executable sections, though on small machines this might matter.) 238 | 239 | At the end of the run, you will see the message: 240 | 241 | `function call count report saved to funcount.txt - decode with funcount2sym to get: call_count, dyn_addr, static_addr, num_bytes, bin_file, src_file:src_line, mangled_func_name` 242 | 243 | `funcount2sym funcount.txt` prints the columns described in the message to standard output; the most commonly interesting ones are highlighted in bold: 244 | 245 | * **`call_count` - the number of times the function was called** 246 | * `dyn_addr` - the dynamic address of the function as loaded into the process (eg what you'd see in `gdb`) 247 | * `static_addr` - the static address of the function in the binary file (what you'd see with `nm`) 248 | * `num_bytes` - the number of bytes making up the function, a proxy for how many instructions long it is 249 | * `bin_file` - the executable or shared library containing the function 250 | * **`src_file:src_line` - the source file & line where the function is defined**, separated by ":" 251 | *
**`mangled_func_name` - the mangled function name**; you can pipe funcount2sym through `c++filt` to demangle it, though often you will want the mangled name 252 | 253 | You can sort this report with `sort -nr` and add reports from multiple runs together with `awk`. To exclude frequently called functions from tracing, you can use the `NOFUNTRACE` attribute (as in `void NOFUNTRACE myfunc()`); `#include "funtrace.h"` to access the macro. You can also use the `-funtrace-no-trace=file` flag implemented by `funtrace++`, and pass it a file with a list of _mangled_ function names. See also "Disabling and enabling tracing" below. This might be faster than opening every relevant source file and adding `NOFUNTRACE` to every excluded function definition, and it avoids issues where the function attribute doesn't exclude the function for whatever reason. 254 | 255 | The advantage of the NOFUNTRACE attribute, apart from being kept together with the function definition (so you know easily what's traced and what's not), is that the overhead is **fully** removed, whereas `-funtrace-no-trace=file` only removes most of the overhead - it removes the calls to the entry/exit hooks, but the code is still "scarred" by the code having been generated. This is a small fraction of the overhead but if lots and lots of functions are "scarred" this way, it can add up. 256 | 257 | If the source files aren't where the debug info says they are, and/or the executable or shared objects are not where they were when the process was running, you can use `substitute-path.json` in the current directory of `funcount2sym` same as with `funtrace2viz`, as described in the next section. 258 | 259 | # Decoding traces 260 | 261 | `funtrace2viz funtrace.raw out` will produce an `out.json`, `out.1.json`, `out.2.json` etc. per trace sample in the file. 
(The snapshot-saving functions only put one sample into a file; the `funtrace.raw` file appended to by SIGTRAP and its programmatic equivalent can contain multiple samples.) 262 | 263 | If funtrace2viz can't find some of the source files or binaries it needs, it will print warnings; you can make it find the files using a `substitute-path.json` in its current directory. This JSON file should contain an array of arrays of length 2, for example: 264 | 265 | ``` json 266 | [ 267 | ["/build/server/source-dir/","/home/user/source-dir/"], 268 | ["/deployment/machine/binary-dir/","/home/user/binary-dir/"] 269 | ] 270 | ``` 271 | For every path string, funtrace2viz iterates over every pair in the array, replacing every occurrence of the first string with the second string in the pair. 272 | 273 | Command line flags: 274 | 275 | * `-r/--raw-timestamps`: report the raw timestamps, rather than defining the earliest timestamp in each sample as 0 and counting from there 276 | * `-e/--executable-file-info`: on top of a function's name, file & line, show the binary it's from and its static address 277 | * `-m/--max-event-age`: ignore events older than this age; this is most likely to be useful for SIGTRAP-type snapshots where you have very old events from mostly idle threads and they cause the GUI timeline to zoom out so much you can't see anything. You can guess what the age is in part by looking at the printouts of funtrace2viz which tells the time range of the events traced from each thread 278 | * `-o/--oldest-event-time`: like `--max-event-age` but with the threshold defined as a timestamp instead of age 279 | * `-t/--threads`: a comma-separated list of thread TIDs - threads outside this list are ignored (including for the purpose of interpreting `--max-event-age` - if you ignore the thread with the most recent event, then the most recent event from threads you didn't ignore becomes "the most recent event" for age calculation purposes.)
This is also something that's mostly useful for SIGTRAP-type snapshots to exclude mostly idle threads 280 | * `-s/--samples`: a comma-separated list of sample indexes - samples outside this list are ignored. Useful for the multi-sample `funtrace.raw` file appended to by SIGTRAP 281 | * `-d/--dry`: useful for a very large multi-sample `funtrace.raw` file if you want to decide what samples to focus on; this prints the time ranges of the threads in each sample, but doesn't decode anything (decoding runs at a rate of about 1MB of binary data per second) 282 | 283 | # Compile-time & runtime configuration 284 | 285 | ## Controlling which functions are traced 286 | 287 | Control at function granularity is only available at build time, as follows: 288 | 289 | * **Compiler function attributes**: 290 | * `NOFUNTRACE` - a function attribute excluding a function from tracing (eg `void NOFUNTRACE func()` - this is the `__attribute__((...))` syntax of gcc/clang). 291 | * `DOFUNTRACE` - a function attribute forcing the inclusion of a function in tracing - currently only meaningful for XRay, which might otherwise exclude functions due to the `-fxray-instruction-threshold=N` flag 292 | * **Assembly filtering flags**: if you use the `funtrace++` wrapper around g++/clang++ in your build system (which you'd want to do solely to get the flags below), you get the option to filter compiler-generated assembly code to exclude some functions from tracing; this is convenient with foreign code (eg functions in standard or external library header files) as well as "to cast a wide net" based on function length a-la XRay's `-fxray-instruction-threshold=N` (_note that assembly filtering is not supported with XRay_): 293 | * `-funtrace-do-trace=file` - the file should contain a list of whitespace-separated mangled function names, these functions will NOT excluded from tracing 294 | * `-funtrace-no-trace=file` - the file should contain a list of whitespace-separated mangled function names, these 
functions WILL be excluded from tracing 295 | * `-funtrace-instr-thresh=N` - functions with less than N instructions will be excluded from tracing together with function calls inlined into them, UNLESS they have loops 296 | * `-funtrace-ignore-loops` - if -funtrace-instr-thresh=N was passed, functions with less than N instructions will be excluded from tracing together with function calls inlined into them, EVEN IF they have loops 297 | 298 | There are thus several ways to ask to include or exclude a function from tracing; what happens if they conflict? 299 | 300 | * NOFUNTRACE "always wins" (unless there's a compiler issue where it's ignored for whatever reason) - you can't trace a function successfully excluded with NOFUNTRACE 301 | * DOFUNTRACE currently only means the function will survive XRay filtering; it does nothing for other instrumentation methods, so the function might be excluded from tracing with these methods (eg by -finstrument-functions-after-inlining or -finstrument-functions-exclude-file-list) 302 | * For functions which "survived exclusion by the compiler": 303 | * A function on the list passed to -funtrace-do-trace is always kept 304 | * Otherwise, a function on the list passed to -funtrace-no-trace is excluded, and so are function calls inlined into it 305 | * Otherwise, a function with less than N instructions where N was defined with -funtrace-instr-thresh=N and has no loops is excluded, and so are function calls inlined into it. If it has loops but -funtrace-ignore-loops was passed, it is also excluded, and so are function calls inlined into it.
306 | 307 | ## Disabling & enabling tracing 308 | 309 | * `funtrace_ignore_this_thread()` excludes the calling thread from tracing "forever" (there's currently no way to undo this) 310 | * `funtrace_disable_tracing()` disables tracing globally (note that taking a snapshot effectively does the same thing until the snapshot is ready) 311 | * `funtrace_enable_tracing()` (re-)enables the tracing globally (by default, tracing is on when the program starts so you needn't do it; "on by default" means you can get a trace from a core dump and from a live process with SIGTRAP without any tweaking to the program source) 312 | 313 | Additionally, compiling with -DFUNTRACE_FTRACE_EVENTS_IN_BUF=0 or setting $FUNTRACE_FTRACE_EVENTS_IN_BUF to 0 at runtime effectively disables ftrace scheduling event tracing, as mentioned again in the next section. 314 | 315 | ## Controlling buffer sizes & lifetimes 316 | 317 | * `funtrace_set_thread_log_buf_size(log_buf_size)` sets the trace buffer size of the calling thread to `pow(2, log_buf_size)`. Passing 0 (or a value smaller than log(size of 2 trace entries), so currently 5) is equivalent to calling `funtrace_ignore_this_thread()` 318 | * The following parameters can be controlled by passing `-DNAME=VALUE` to the compiler (the command line equivalent of `#define NAME VALUE`), and/or reconfigured at runtime by setting the environment variable `$NAME` to `VALUE`: 319 | * `FUNTRACE_LOG_BUF_SIZE`: each thread starts with a thread-local trace buffer of this size (the default is 20, meaning 1M bytes = 32K trace entries ~= 16K most recent function calls.) This initial buffer size can then be changed using `funtrace_set_thread_log_buf_size()` 320 | * `FUNTRACE_FTRACE_EVENTS_IN_BUF`: the number of entries in this process's userspace ftrace buffer (the default is 20000; the size in bytes can vary since each entry keeps one line of textual ftrace data.) 
Passing `-DFUNTRACE_FTRACE_EVENTS_IN_BUF=0` disables ftrace at compile time - this **cannot** be changed by setting the env var at runtime to a non-zero value. 321 | * `FUNTRACE_GC_MAX_AGE_MS`: when set to 0, a thread's thread-local trace buffer is freed upon thread exit - which means the trace data will be missing from future snapshots, even though the events in that buffer might have been recorded during the time range covered by the snapshot. When set to a non-zero value (default: 300 ms), thread trace buffers are kept after thread exit, and garbage-collected every FUNTRACE_GC_PERIOD_MS (see below); only buffers with age exceeding FUNTRACE_GC_MAX_AGE_MS are freed. Passing `-DFUNTRACE_GC_MAX_AGE_MS=0` disables garbage collection at compile time - this **cannot** be changed by setting the env var at runtime to a non-zero value. 322 | * `FUNTRACE_GC_PERIOD_MS`: unless compiled out by #defining FUNTRACE_GC_MAX_AGE_MS to 0, the thread trace buffer garbage collection runs every FUNTRACE_GC_PERIOD_MS ms (default: the compile-time value of FUNTRACE_GC_MAX_AGE_MS.) 323 | 324 | # Limitations 325 | 326 | * **Can't trace inside shared libraries unless they're loaded by an executable containing the funtrace runtime** - for example, a Python extension module written in C++ can't be traced, similarly to any other kind of plugin loaded by a program not compiled with funtrace. This is because of the TLS issue explained above.
327 | * **Thread creation/exit and saving a trace snapshot take the same lock** - this can slow things down; hopefully not too badly since saving a snapshot is pretty fast, and creating lots of threads at runtime (rather than reusing from a thread pool) should be rare 328 | * **ftrace / thread scheduling events might have issues near the snapshot time range boundaries**: 329 | * Perfetto might not render thread status very clearly near the boundaries even when it's clear from the ftrace log 330 | * There's a latency between a thread scheduling event and the moment it's delivered to funtrace's userspace thread collecting the events (we try to give this thread a high priority but will typically lack permissions to give it a real-time priority.) One way around this could be *a mechanism for "late delivery" of ftrace events into snapshots* - since most of the time, snapshots are written to the file system much later than they're captured, we could put ftrace events into those already-captured, but not-yet-written-out snapshots whose time range contains a given newly arrived event. Doable, but a bit of a hassle, could be done given demand. 331 | * **Threads which exited by the time a snapshot was taken might be invisible in the trace** - unless the thread trace GC parameters were tuned such that the trace buffer is still around when the snapshot is taken, as explained above 332 | * **Funcount misses constructor calls** - shouldn't matter for its goal of finding functions called so often that you want to exclude them from tracing to avoid the overhead 333 | * **Overlapping time ranges** should never happen but might in some cases. The Perfetto/Chromium JSON spec requires events' time ranges to be nested within each other or not overlap at all. funtrace2viz takes this requirement seriously (rather than breaking it on the currently seemingly correct theory that some ways of breaking it are actually supported.) 
So when funtrace2viz observes that 20 functions have just returned (by seeing that f which called 19 functions has just returned, perhaps because of a longjmp or an exception being caught), it produces 20 different timestamps apart by at least 1 ns, the smallest time unit in the JSON. Some of these made-up return timestamps might cause overlap with later function calls. 334 | * **Tail call artifacts** with some instrumentation methods, as documented in the section "Choosing compiler instrumentation" 335 | * **Untraced exception catcher artifacts** with some instrumentation methods, as documented in the section "Choosing compiler instrumentation." A related but likely extremely rare artifact you might see with these instrumentation methods is mixing recursion and exception handling where you have a recursive function that doesn't catch an exception at the innermost recursion level but then does catch it at another level - funtrace trace analysis will incorrectly assume the exception was caught at the innermost level (unless `gcc -finstrument-functions` was used, which calls the on-return hook when unwinding the stack and doesn't require guesswork at trace analysis time.) 336 | * **Unloading traced shared libraries within the time range of a snapshot is unsupported** - a trace snapshot contains an address space snapshot made at the end of the time range, so if a shared library was unloaded, functions traced from it will not be decodable in the trace; reusing the executable address space for new addresses will mess up decoding further. A need to dlclose libraries midway thru the tracing is probably extremely rare. 337 | * **Mixing instrumentation methods in the same build or process wasn't tested** and might not work for various reasons; this feels like a fairly esoteric need, but can almost certainly be made to work given demand. 
338 | 339 | # Funtrace file format 340 | 341 | You don't need to know this format unless you want to generate or process `funtrace.raw` files, or extend funtrace for your needs. 342 | 343 | Funtrace data is binary, using little endian encoding for integers. It consists of "chunks" where each chunk has an 8-byte magic number, a 64-bit size integer, and then a sequence of data bytes of the length specified by the size integer. Here are the chunk types and the format of the data: 344 | 345 | * **`PROCMAPS`**: the content of `/proc/self/maps` can go here; only the start, end, offset and path fields are used, and only the executable segments are listed at this stage (funtrace uses `dl_iterate_phdr` rather than `/proc/self/maps` to speed up snapshotting), but readonly data segments might go here eventually, too, eg if we implement custom log messages with [delayed formatting](https://yosefk.com/blog/delayed-printf-for-real-time-logging.html). Permissions and inode info are ignored. 346 | * **`FUNTRACE`**: an 8-byte chunk indicating the start of a snapshot, with an 8-byte frequency of the timestamp counter, used to convert counter values into nanoseconds. A snapshot is interpreted according to the memory map reported by the last encountered `PROCMAPS` chunk (there may be many snapshots in the same file; currently the funtrace runtime saves a `PROCMAPS` chunk every time it takes a snapshot but if you know that your memory map remains stable over time and you want to shave off a little bit of latency, you could tweak this.) 347 | * **`CMD LINE`**: the process command line, used as the process name when generating the JSON. 
A wart worth mentioning is that currently, the funtrace runtime reads this from `/proc/self/cmdline` and replaces null characters separating the arguments with spaces, which means that the shell command `prog "aaa bbb"`, which passes a single string argument `aaa bbb`, will be saved as `prog aaa bbb` (two string arguments). So we save enough to help you see "the trace of what you're looking at" but not enough to eg use the saved command line for reproducing the run. 348 | * **`THREADID`**: a 64b PID integer, a 64b TID integer, and a null-terminated 16-byte name string (the content of `/proc/self/comm` aka the output of `pthread_getname_np(pthread_self(),...)`.) This precedes every `TRACEBUF` chunk (documented next.) 349 | * **`TRACEBUF`**: a variable sized chunk of length which is a multiple of 16. It contains trace entries; each entry is a 64b code pointer, and a 64b timestamp counter value. The entries are _not_ sorted by the timestamp, for 2 reasons - they come from a cyclic buffer, and the funtrace writeout code is racy, so you can have rare cases of `new_entry, old_entry, new_entry` near the end of the cyclic buffer because one of the newest entries didn't make it into the buffer so you got a much older entry. So you need to sort the entries for processing, and you need to "defend" against missing events (meaning, you could see a return without a call or a call without a return; this is not just because of the raciness of the writeout but because the cyclic buffer ends before "the end of program execution" and starts after "the start of execution" and you can have various other niceties like longjmp.) The code pointer can have the following flags set in its high bits: 350 | * `RETURN` (63): a return event, where the code pointer points into the returning function 351 | * `RETURN_WITH_CALLER_ADDRESS` (62): a return event where the code pointer points _into the function we're returning to_. 
This unfortunate tracing artifact happens under XRay instrumentation; funtrace2viz mostly recovers the flow despite this. When this bit and the previous bit are both set, this is a `CATCH` event, and the code pointer points into the function that caught the exception. 352 | * `CALL_RETURNING_UPON_THROW` (61): marks call events that will have a return event logged for them if an exception is thrown. Under most instrumentation methods this does not happen and so funtrace2viz guesses which functions effectively returned during stack unwinding. When it sees a call entry with this flag set, it knows that this function wouldn't return without logging a return event even if an exception was thrown, which prevents it from wrongly guessing that the function returned due to unwinding. 353 | * **`FTRACETX`**: a variable-sized chunk containing textual ftrace data (one event per line - what you read from `/sys/kernel/tracing/trace_pipe`). The timestamps in this data and the trace entries from `TRACEBUF` are from the same time source. 354 | * **`ENDTRACE`**: a zero-sized chunk marking the end of a snapshot. 355 | --------------------------------------------------------------------------------