├── tests ├── no-trace-bench.txt ├── no-trace.txt ├── do-trace.txt ├── untraced_catcher.cpp ├── asm_filter_2.cpp ├── freq.cpp ├── lib_shared.cpp ├── lib_dyn_shared.cpp ├── sigtrap.cpp ├── tailcall.cpp ├── c.c ├── test.h ├── untraced_funcs.cpp ├── count_shared.cpp ├── count_dyn_shared.cpp ├── shared.cpp ├── killed.cpp ├── buf_size.cpp ├── ftrace.cpp ├── asm_filter.cpp ├── count.cpp ├── ignore_disable.cpp ├── longjmp.cpp ├── exceptions.cpp ├── benchmark.cpp └── orphans.cpp ├── .gitignore ├── images └── krita-trace.png ├── Cargo.toml ├── funcount2sym ├── Cargo.toml └── src │ └── main.rs ├── procaddr2sym ├── Cargo.toml └── src │ └── lib.rs ├── simple-example ├── shared.cpp ├── run.sh ├── test.cpp └── build.sh ├── release.sh ├── funtrace2viz ├── Cargo.toml └── src │ └── main.rs ├── funtrace.dyn ├── compiler-wrappers ├── xray │ └── ld ├── funtrace-finstr-clang++ ├── funtrace-xray-clang++ ├── funtrace-pg-g++ ├── funtrace-finstr-g++ └── funtrace++ ├── LICENSE.txt ├── funtrace_flags.h ├── fun_xray_so.S ├── funcount_pg.S ├── funtrace.h ├── funtrace_gdb.py ├── funtrace_pg.S ├── funcount.cpp ├── tests.py └── README.md /tests/no-trace-bench.txt: -------------------------------------------------------------------------------- 1 | trace_filtered 2 | -------------------------------------------------------------------------------- /tests/no-trace.txt: -------------------------------------------------------------------------------- 1 | _Z20long_but_blacklistedv 2 | -------------------------------------------------------------------------------- /tests/do-trace.txt: -------------------------------------------------------------------------------- 1 | _Z21short_but_whitelistedv 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Cargo.lock 2 | target/ 3 | built-tests/* 4 | out/* 5 | .*.swp 6 | 
-------------------------------------------------------------------------------- /tests/untraced_catcher.cpp: -------------------------------------------------------------------------------- 1 | #define UNTRACED_CATCHER 2 | #include "exceptions.cpp" 3 | -------------------------------------------------------------------------------- /images/krita-trace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yosefk/funtrace/HEAD/images/krita-trace.png -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["procaddr2sym", "funcount2sym", "funtrace2viz"] 3 | resolver = "2" 4 | -------------------------------------------------------------------------------- /tests/asm_filter_2.cpp: -------------------------------------------------------------------------------- 1 | //the same test as asm_filter but compiled with different -funtrace-* flags - we want to see that we get a different trace that way 2 | #include "asm_filter.cpp" 3 | -------------------------------------------------------------------------------- /funcount2sym/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "funcount2sym" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | procaddr2sym = { path = "../procaddr2sym" } 10 | -------------------------------------------------------------------------------- /procaddr2sym/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "procaddr2sym" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | addr2line = "0.20" 8 | chrono = "0.4.39" 9 | cpp_demangle = "0.4.4" 10 | goblin = "0.9.2" 11 | 
memmap2 = "0.9.5" 12 | procfs = "0.17.0" 13 | serde_json = "1.0.134" 14 | -------------------------------------------------------------------------------- /tests/freq.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include 3 | 4 | volatile int n=0; 5 | 6 | void NI usleep_1500() 7 | { 8 | usleep(1500); 9 | n++; 10 | } 11 | 12 | int main() 13 | { 14 | //test that we convert TSC to us correctly 15 | scope_tracer tracer; 16 | usleep_1500(); 17 | } 18 | -------------------------------------------------------------------------------- /simple-example/shared.cpp: -------------------------------------------------------------------------------- 1 | 2 | volatile int glob; 3 | 4 | void __attribute__((noinline)) shared_f(int n) 5 | { 6 | glob = n; 7 | } 8 | 9 | void __attribute__((noinline)) shared_g(int a1, int a2, int a3, int a4, int a5, int a6) 10 | { 11 | shared_f(a1+a2+a3+a4+a5+a6); 12 | shared_f(a1*a2*a3*a4*a5*a6); 13 | } 14 | -------------------------------------------------------------------------------- /tests/lib_shared.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int shared_n; 4 | 5 | void NI f_shared() 6 | { 7 | shared_n++; 8 | } 9 | 10 | void NI g_shared() 11 | { 12 | f_shared(); 13 | f_shared(); 14 | shared_n++; 15 | } 16 | 17 | void NI h_shared() 18 | { 19 | g_shared(); 20 | f_shared(); 21 | shared_n++; 22 | } 23 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | cd .. 
4 | rm -f funtrace/funtrace.zip 5 | zip funtrace/funtrace.zip funtrace/README.md funtrace/funtrace.cpp funtrace/funcount.cpp funtrace/funtrace.h funtrace/funtrace_flags.h funtrace/*.S funtrace/funtrace.dyn \ 6 | funtrace/target/x86_64-unknown-linux-gnu/release/{funcount2sym,funtrace2viz} funtrace/compiler-wrappers/* funtrace/compiler-wrappers/xray/* funtrace/simple-example/* 7 | -------------------------------------------------------------------------------- /funtrace2viz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "funtrace2viz" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | procaddr2sym = { path = "../procaddr2sym" } 10 | bytemuck = { version="1.20.0", features = ["derive"] } 11 | serde_json = "1.0.133" 12 | clap = { version = "3.0", features = ["derive"] } 13 | num = "0.4.3" 14 | -------------------------------------------------------------------------------- /funtrace.dyn: -------------------------------------------------------------------------------- 1 | { 2 | funtrace_pause_and_write_current_snapshot; 3 | funtrace_pause_and_get_snapshot; 4 | funtrace_time; 5 | funtrace_ticks_per_second; 6 | funtrace_pause_and_get_snapshot_starting_at_time; 7 | funtrace_pause_and_get_snapshot_up_to_age; 8 | funtrace_free_snapshot; 9 | funtrace_write_snapshot; 10 | funtrace_ignore_this_thread; 11 | funtrace_set_thread_log_buf_size; 12 | funtrace_disable_tracing; 13 | funtrace_enable_tracing; 14 | }; 15 | -------------------------------------------------------------------------------- /tests/lib_dyn_shared.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int dyn_shared_n; 4 | 5 | void NI f_dyn_shared() 6 | { 7 | dyn_shared_n++; 8 | } 9 | 10 | void NI g_dyn_shared() 11 | { 12 | f_dyn_shared(); 13 | 
f_dyn_shared(); 14 | dyn_shared_n++; 15 | } 16 | 17 | void NI h_dyn_shared() 18 | { 19 | g_dyn_shared(); 20 | f_dyn_shared(); 21 | dyn_shared_n++; 22 | } 23 | 24 | extern "C" void NI h_dyn_shared_c() 25 | { 26 | h_dyn_shared(); 27 | dyn_shared_n++; 28 | } 29 | -------------------------------------------------------------------------------- /tests/sigtrap.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include 3 | #include 4 | #include 5 | 6 | volatile int n; 7 | 8 | void NI traced_func() 9 | { 10 | n++; 11 | } 12 | 13 | void NI traced_thread() 14 | { 15 | while(true) { 16 | traced_func(); 17 | n++; 18 | } 19 | } 20 | 21 | int main() 22 | { 23 | funtrace_ignore_this_thread(); 24 | 25 | std::thread t(traced_thread); 26 | t.detach(); 27 | 28 | while(n < 100); 29 | 30 | kill(getpid(), SIGTRAP); 31 | } 32 | -------------------------------------------------------------------------------- /tests/tailcall.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int n; 4 | 5 | void NI callee() 6 | { 7 | n++; 8 | }; 9 | 10 | void NI tail_caller() 11 | { 12 | n++; 13 | callee(); 14 | } 15 | 16 | void NI NOFUNTRACE callee_untraced() 17 | { 18 | n++; 19 | } 20 | 21 | void NI tail_caller_untraced() 22 | { 23 | n++; 24 | callee_untraced(); 25 | } 26 | 27 | int main() 28 | { 29 | scope_tracer tracer; 30 | for(int i=0; i<3; ++i) { 31 | tail_caller(); 32 | tail_caller_untraced(); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /tests/c.c: -------------------------------------------------------------------------------- 1 | #include "funtrace.h" 2 | 3 | #define NI __attribute__((noinline)) 4 | 5 | volatile int n; 6 | 7 | void NI f() 8 | { 9 | n++; 10 | } 11 | 12 | void NI g() 13 | { 14 | f(); 15 | n++; 16 | f(); 17 | n++; 18 | } 19 | 20 | int main() 21 | { 22 | uint64_t start = funtrace_time(); 23 | 24 
| g(); 25 | 26 | funtrace_snapshot* snapshot = funtrace_pause_and_get_snapshot_starting_at_time(start); 27 | funtrace_write_snapshot("funtrace.raw", snapshot); 28 | funtrace_free_snapshot(snapshot); 29 | } 30 | -------------------------------------------------------------------------------- /tests/test.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "funtrace.h" 4 | 5 | #define NI __attribute__((noinline)) 6 | 7 | struct scope_tracer 8 | { 9 | uint64_t start_time = 0; 10 | const char* fname = nullptr; 11 | 12 | NOFUNTRACE scope_tracer(const char* f="funtrace.raw") 13 | { 14 | fname = f; 15 | start_time = funtrace_time(); 16 | } 17 | 18 | NOFUNTRACE ~scope_tracer() 19 | { 20 | funtrace_snapshot* snapshot = funtrace_pause_and_get_snapshot_starting_at_time(start_time); 21 | funtrace_write_snapshot(fname, snapshot); 22 | funtrace_free_snapshot(snapshot); 23 | }; 24 | }; 25 | -------------------------------------------------------------------------------- /tests/untraced_funcs.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int n; 4 | 5 | inline void NOFUNTRACE nop() {} 6 | 7 | #define UNTRACED(name, callee) void NI NOFUNTRACE name() { n++; callee(); n++; } 8 | #define TRACED(name, callee) void NI name() { n++; callee(); n++; } 9 | 10 | UNTRACED(un1, nop); 11 | TRACED(tr1, un1); 12 | UNTRACED(un2, tr1); 13 | TRACED(tr2, un2); 14 | 15 | UNTRACED(un3, nop); 16 | UNTRACED(un4, un3); 17 | TRACED(tr3, un4); 18 | TRACED(tr4, tr3); 19 | UNTRACED(un5, tr4); 20 | UNTRACED(un6, un5); 21 | 22 | int main() 23 | { 24 | scope_tracer tracer; 25 | 26 | tr2(); 27 | un2(); 28 | 29 | un6(); 30 | tr4(); 31 | } 32 | -------------------------------------------------------------------------------- /tests/count_shared.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int 
shared_n; 4 | 5 | //we want the libraries to be loaded far apart to make sure 6 | //funcount actually finds the newly mapped executable segments 7 | //as opposed to "being lucky" with them mapped where it already 8 | //has pages in its page table 9 | char buf[256*1024]={1}; 10 | 11 | struct glob 12 | { 13 | glob() { shared_n++; } 14 | } gg; 15 | 16 | void NI f_shared() 17 | { 18 | shared_n++; 19 | } 20 | 21 | void NI g_shared() 22 | { 23 | f_shared(); 24 | shared_n++; 25 | f_shared(); 26 | } 27 | 28 | void NI h_shared() 29 | { 30 | g_shared(); 31 | shared_n++; 32 | f_shared(); 33 | } 34 | -------------------------------------------------------------------------------- /simple-example/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -ex 4 | 5 | ./out/test_trace.fi-gcc 6 | ./target/x86_64-unknown-linux-gnu/release/funtrace2viz funtrace.raw out/funtrace-fi-gcc 7 | rm funtrace.raw 8 | 9 | ./out/test_trace.pg 10 | ./target/x86_64-unknown-linux-gnu/release/funtrace2viz funtrace.raw out/funtrace-pg 11 | rm funtrace.raw 12 | 13 | ./out/test_trace.fi-clang 14 | ./target/x86_64-unknown-linux-gnu/release/funtrace2viz funtrace.raw out/funtrace-fi-clang 15 | rm funtrace.raw 16 | 17 | if [ -e ./out/test_trace.xray ]; then 18 | env XRAY_OPTIONS="patch_premain=true" ./out/test_trace.xray 19 | ./target/x86_64-unknown-linux-gnu/release/funtrace2viz funtrace.raw out/funtrace-xray 20 | rm funtrace.raw 21 | fi 22 | -------------------------------------------------------------------------------- /tests/count_dyn_shared.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int dyn_shared_n; 4 | 5 | //we want the libraries to be loaded far apart to make sure 6 | //funcount actually finds the newly mapped executable segments 7 | //as opposed to "being lucky" with them mapped where it already 8 | //has pages in its page table 9 | char 
buf_shared[256*1024]={1}; 10 | 11 | struct glob_dyn 12 | { 13 | glob_dyn() { dyn_shared_n++; } 14 | } gg_dyn; 15 | 16 | void NI f_dyn_shared() 17 | { 18 | dyn_shared_n++; 19 | } 20 | 21 | void NI g_dyn_shared() 22 | { 23 | f_dyn_shared(); 24 | dyn_shared_n++; 25 | f_dyn_shared(); 26 | } 27 | 28 | void NI h_dyn_shared() 29 | { 30 | g_dyn_shared(); 31 | dyn_shared_n++; 32 | f_dyn_shared(); 33 | } 34 | 35 | extern "C" void NI h_dyn_shared_c() 36 | { 37 | h_dyn_shared(); 38 | } 39 | -------------------------------------------------------------------------------- /simple-example/test.cpp: -------------------------------------------------------------------------------- 1 | #include "funtrace.h" 2 | #include 3 | #include 4 | 5 | #define NL __attribute__((noinline)) 6 | 7 | volatile int n; 8 | 9 | NL void f(int i) 10 | { 11 | n = i; 12 | } 13 | 14 | void NL g(int i) 15 | { 16 | f(i); 17 | } 18 | 19 | void NL h(int i) { 20 | g(i); 21 | g(i); 22 | } 23 | 24 | volatile int done = 0; 25 | 26 | void shared_g(int a1, int a2, int a3, int a4, int a5, int a6); 27 | 28 | int main() 29 | { 30 | std::thread t([]{ 31 | pthread_setname_np(pthread_self(), "child"); 32 | for(int i=0; i<100000; ++i) { 33 | h(1); 34 | } 35 | }); 36 | for(int i=0; i<100000; ++i) { 37 | g(2); 38 | shared_g(1,2,3,4,5,6); 39 | } 40 | t.join(); 41 | 42 | funtrace_pause_and_write_current_snapshot(); 43 | } 44 | -------------------------------------------------------------------------------- /compiler-wrappers/xray/ld: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import os 4 | import subprocess 5 | args = sys.argv[1:] 6 | 7 | # this linker wrapper lets us override XRay's functions like __xray_FunctionEntry; 8 | # for this it puts the XRay runtime libraries after our runtime code, and passes 9 | # --allow-multiple-definition 10 | 11 | start = None 12 | for i,arg in enumerate(args): 13 | if arg == '--whole-archive': 14 | start = i 15 | 
elif arg == '--no-whole-archive' and start is not None: 16 | end = i+1 17 | break 18 | elif 'xray' not in arg: 19 | start = None 20 | 21 | if start is not None: 22 | xraylibs = args[start:end] 23 | args = args[:start] + args[end:] 24 | stdlibspos = args.index('-lc') 25 | args = args[:stdlibspos] + xraylibs + args[stdlibspos:] 26 | 27 | args += ['--allow-multiple-definition'] 28 | 29 | ldpath = subprocess.getoutput('which ld') 30 | os.execl(ldpath, *([ldpath] + args)) 31 | -------------------------------------------------------------------------------- /tests/shared.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "test.h" 8 | 9 | volatile int n; 10 | 11 | void NI f() 12 | { 13 | n++; 14 | } 15 | 16 | void NI g() 17 | { 18 | f(); 19 | f(); 20 | n++; 21 | } 22 | 23 | void NI h() 24 | { 25 | g(); 26 | f(); 27 | n++; 28 | } 29 | 30 | void h_shared(); 31 | void (*h_shared_2)(); 32 | 33 | const int64_t iters = 3; 34 | 35 | void NI loop() 36 | { 37 | for(int64_t i=0; i 3 | #include 4 | #include 5 | #include 6 | 7 | volatile int n; 8 | 9 | void NI f() 10 | { 11 | n++; 12 | } 13 | 14 | void NI g() 15 | { 16 | f(); 17 | n++; 18 | } 19 | 20 | void NOFUNTRACE child_inf() 21 | { 22 | g(); 23 | while(1); 24 | } 25 | 26 | void NOFUNTRACE child_fin() 27 | { 28 | pthread_setname_np(pthread_self(), "child"); 29 | g(); 30 | usleep(150*1000); //to get ftrace events 31 | for(volatile int i=0; i<1000000000; ++i); 32 | } 33 | 34 | int main() 35 | { 36 | { 37 | scope_tracer empty; 38 | //just so funtrace.raw is created 39 | } 40 | g(); 41 | 42 | std::thread t1(child_inf); 43 | std::thread t2(child_fin); 44 | 45 | t2.join(); 46 | 47 | //this will leave an ftrace tracer instance that we want some other run 48 | //of a funtrace-instrumented program to collect 49 | abort(); 50 | } 51 | -------------------------------------------------------------------------------- 
/tests/buf_size.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include 3 | #include 4 | 5 | volatile int n = 0; 6 | 7 | void NI f() 8 | { 9 | n++; 10 | } 11 | 12 | int main() 13 | { 14 | scope_tracer tracer; 15 | 16 | //this incidentally tests garbage collection (the thread dies by the time 17 | //the scope tracer is destroyed and we check that we get both threads' traces) 18 | //in addition to checking that we can set per-thread buffer sizes 19 | std::thread t([] { 20 | funtrace_set_thread_log_buf_size(5+4); 21 | pthread_setname_np(pthread_self(), "event_buf_16"); 22 | //check that only 16 function calls out of these 100 are logged into the small buffer 23 | for(int i=0; i<100; ++i) { 24 | f(); 25 | } 26 | }); 27 | 28 | funtrace_set_thread_log_buf_size(5); 29 | pthread_setname_np(pthread_self(), "event_buf_1"); 30 | //check that only one function call out of these 100 is logged into the small buffer 31 | for(int i=0; i<100; ++i) { 32 | f(); 33 | } 34 | t.join(); 35 | } 36 | -------------------------------------------------------------------------------- /tests/ftrace.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include 3 | #include 4 | #include 5 | 6 | void NI spin() 7 | { 8 | volatile int n=0; 9 | for(n=0; n<100000000; ++n); 10 | } 11 | 12 | volatile int n = 0; 13 | 14 | void NI sleep() 15 | { 16 | usleep(150*1000); 17 | n++; 18 | } 19 | 20 | void NI child() 21 | { 22 | pthread_setname_np(pthread_self(), "child"); 23 | spin(); 24 | sleep(); 25 | spin(); 26 | } 27 | 28 | void NI parent() 29 | { 30 | spin(); 31 | sleep(); 32 | spin(); 33 | } 34 | 35 | int main() 36 | { 37 | //the trouble with ftrace is that there's no guarantee on event 38 | //delivery latency from the kernel to the userspace, so when you 39 | //take a snapshot, you might be missing some events; our sleeping 40 | //and busy loops are hopefully long enough for events 
to be consistently 41 | //observed when testing 42 | scope_tracer tracer; 43 | 44 | pthread_setname_np(pthread_self(), "parent"); 45 | 46 | std::thread t(child); 47 | parent(); 48 | 49 | t.join(); 50 | } 51 | -------------------------------------------------------------------------------- /tests/asm_filter.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | volatile int n; 4 | 5 | void NI short_function() 6 | { 7 | n++; 8 | } 9 | 10 | void NI short_but_whitelisted() 11 | { 12 | n++; 13 | } 14 | 15 | void NI long_enough_function() 16 | { 17 | short_function(); 18 | n++; 19 | short_function(); 20 | n++; 21 | short_function(); 22 | n++; 23 | short_function(); 24 | n++; 25 | short_function(); 26 | n++; 27 | short_function(); 28 | n++; 29 | short_function(); 30 | n++; 31 | short_function(); 32 | n++; 33 | short_function(); 34 | n++; 35 | short_function(); 36 | n++; 37 | short_function(); 38 | } 39 | 40 | void NI long_but_blacklisted() 41 | { 42 | short_function(); 43 | n++; 44 | short_function(); 45 | n++; 46 | short_function(); 47 | } 48 | 49 | void NI short_with_loop() 50 | { 51 | while(!n); 52 | } 53 | 54 | int main() 55 | { 56 | scope_tracer tracer; 57 | 58 | short_function(); 59 | short_but_whitelisted(); 60 | long_enough_function(); 61 | long_but_blacklisted(); 62 | short_with_loop(); 63 | } 64 | -------------------------------------------------------------------------------- /tests/count.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "test.h" 8 | 9 | volatile int n; 10 | 11 | void NI f() 12 | { 13 | n++; 14 | } 15 | 16 | void NI g() 17 | { 18 | f(); 19 | n++; 20 | f(); 21 | } 22 | 23 | void NI h() 24 | { 25 | g(); 26 | n++; 27 | f(); 28 | } 29 | 30 | void h_shared(); 31 | void (*h_shared_2)(); 32 | 33 | const int64_t iters = 1000; 34 | 35 | void loop() 36 | { 37 | 
for(int64_t i=0; i 3 | 4 | volatile int n = 0; 5 | 6 | void NI should_be_traced() { n++; } 7 | //shouldn't be traced since it's called from an ignored thread 8 | void NI shouldnt_be_traced() { n++; } 9 | 10 | const char* g_child_name = "none"; 11 | 12 | void NI traced_thread() 13 | { 14 | n++; 15 | pthread_setname_np(pthread_self(), g_child_name); 16 | should_be_traced(); 17 | n++; 18 | } 19 | 20 | void NI ignored_thread() 21 | { 22 | n++; 23 | shouldnt_be_traced(); 24 | n++; 25 | funtrace_ignore_this_thread(); 26 | shouldnt_be_traced(); 27 | n++; 28 | } 29 | 30 | void run_threads() 31 | { 32 | std::thread t1(traced_thread); 33 | std::thread t2(ignored_thread); 34 | should_be_traced(); 35 | t1.join(); 36 | t2.join(); 37 | } 38 | 39 | int main() 40 | { 41 | pthread_setname_np(pthread_self(), "main"); 42 | scope_tracer tracer; 43 | 44 | g_child_name = "child1"; 45 | run_threads(); 46 | 47 | funtrace_disable_tracing(); 48 | g_child_name = "child2"; 49 | run_threads(); 50 | 51 | funtrace_enable_tracing(); 52 | g_child_name = "child3"; 53 | run_threads(); 54 | } 55 | -------------------------------------------------------------------------------- /compiler-wrappers/funtrace-finstr-clang++: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import os 4 | import subprocess 5 | linking = '-c' not in sys.argv and '-S' not in sys.argv and '-E' not in sys.argv 6 | args = sys.argv[1:] 7 | 8 | # clang doesn't have -finstrument-functions-exclude-file-list so you can't exclude header files 9 | # easily; it does however have -finstrument-functions-after-inlining which is a good default 10 | # in general and especially in the absence of -finstrument-functions-exclude-file-list. 
11 | # you can use -finstrument-functions instead if needed 12 | args += "-g -pthread -finstrument-functions-after-inlining".split() 13 | 14 | srcdir = os.path.dirname(os.path.dirname(__file__)) 15 | if linking: 16 | is_shared = '-shared' in args 17 | if not is_shared: # don't link the runtime into .so's - only into the executables using them 18 | args += [os.path.join(srcdir, f) for f in ['funtrace.cpp']] + [f'-Wl,--dynamic-list={srcdir}/funtrace.dyn'] 19 | else: 20 | # remove no-undefined for access to the funtrace_* runtime functions 21 | args = [a for a in args if a != '-Wl,--no-undefined'] 22 | args += ['-ldl'] 23 | 24 | funtracexx = os.path.join(srcdir, 'compiler-wrappers/funtrace++') 25 | os.execl(funtracexx, *([funtracexx, 'clang++'] + args)) 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024-2025, Yossi Kreinin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | -------------------------------------------------------------------------------- /tests/longjmp.cpp: -------------------------------------------------------------------------------- 1 | //we use longjmp just as an example of something that breaks the assumption 2 | //that you get a return-from-function event eventually after it was called - 3 | //instead you have here a bunch of functions that are called and never returned 4 | //from. we use this to test the ability of funtrace2viz to (somewhat) recover from such 5 | //scenarios, of which the use of longjmp is one potential cause [which we could 6 | //try to eliminate by interposing longjmp but it doesn't seem popular enough to 7 | //bother and there are probably others] 8 | 9 | #include 10 | #include "test.h" 11 | 12 | volatile int n; 13 | jmp_buf jmpbuf; 14 | 15 | void NI jumper() 16 | { 17 | n++; 18 | longjmp(jmpbuf, 1); 19 | } 20 | 21 | void NI wrapper_call() 22 | { 23 | n++; 24 | jumper(); 25 | n++; 26 | } 27 | 28 | void NI wrapper_call_outer() 29 | { 30 | n++; 31 | wrapper_call(); 32 | n++; 33 | } 34 | 35 | void NI before_setjmp() 36 | { 37 | n++; 38 | } 39 | 40 | void NI after_longjmp() 41 | { 42 | n++; 43 | } 44 | 45 | void NI setter() 46 | { 47 | n++; 48 | before_setjmp(); 49 | if(setjmp(jmpbuf)) { 50 | after_longjmp(); 51 | } 52 | else { 53 | wrapper_call_outer(); 54 | } 55 | } 56 | 57 | int main() 58 | { 59 | scope_tracer tracer; 60 | for(int i=0; i<3; ++i) { 61 | 
setter(); 62 | } 63 | } 64 | 65 | -------------------------------------------------------------------------------- /compiler-wrappers/funtrace-xray-clang++: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import os 4 | import subprocess 5 | linking = '-c' not in sys.argv and '-S' not in sys.argv and '-E' not in sys.argv 6 | args = sys.argv[1:] 7 | 8 | # we use the default instruction threshold, pass -fxray-instruction-threshold=N to override 9 | args += '-g -pthread -fxray-instrument'.split() 10 | if linking: 11 | srcdir = os.path.dirname(os.path.dirname(__file__)) 12 | args += [f'-B{srcdir}/compiler-wrappers/xray'] # use our ld wrapper to override __xray_Function* handlers 13 | 14 | is_shared = '-shared' in args 15 | if not is_shared: # don't link the runtime into .so's - only into the executables using them 16 | args += [os.path.join(srcdir, f) for f in ['funtrace.cpp', 'funtrace_pg.S']] + [f'-Wl,--dynamic-list={srcdir}/funtrace.dyn'] 17 | args += ['-ldl'] 18 | else: 19 | args += ['-fxray-shared','fun_xray_so.S'] # this requires a pretty new version of LLVM; older ones can't instrument inside shared objects 20 | # remove no-undefined for access to the funtrace_* runtime functions 21 | args = [a for a in args if a != '-Wl,--no-undefined'] 22 | 23 | # currently funtrace++ doesn't support filtering of XRay compiler output so we run clang++ directly 24 | clangpath = subprocess.getoutput('which clang++') 25 | os.execl(clangpath, *([clangpath] + [arg for arg in args if not arg.startswith('-funtrace')])) 26 | -------------------------------------------------------------------------------- /compiler-wrappers/funtrace-pg-g++: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import os 4 | import subprocess 5 | linking = '-c' not in sys.argv and '-S' not in sys.argv and '-E' not in sys.argv 6 | args = sys.argv[1:] 7 | 8 | args 
+= '-g -pthread'.split() 9 | srcdir = os.path.dirname(os.path.dirname(__file__)) 10 | if linking: 11 | is_shared = '-shared' in args 12 | if not is_shared: # don't link the runtime into .so's - only into the executables using them 13 | args += [os.path.join(srcdir, f) for f in ['funtrace.cpp', 'funtrace_pg.S']] + [f'-Wl,--dynamic-list={srcdir}/funtrace.dyn'] 14 | else: 15 | # remove no-undefined - __return__ will be undefined in shared objects and so will the funtrace_* runtime functions 16 | args = [a for a in args if a != '-Wl,--no-undefined'] 17 | args += ['-ldl'] 18 | # note that we don't pass -pg when linking (and therefore the -mfentry and -minstrument-return-call 19 | # flags which do nothing without -pg.) this is to avoid the generation of gmon.out. the downside 20 | # is that if .cpp files are passed to the linker (so compiling and linking in a single command), 21 | # we won't instrument those files. proper build system integration is ofc better than this wrapper... 22 | else: 23 | args += '-pg -mfentry -minstrument-return=call'.split() 24 | 25 | funtracexx = os.path.join(srcdir, 'compiler-wrappers/funtrace++') 26 | os.execl(funtracexx, *([funtracexx, 'g++'] + args)) 27 | -------------------------------------------------------------------------------- /compiler-wrappers/funtrace-finstr-g++: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import os 4 | import subprocess 5 | linking = '-c' not in sys.argv and '-S' not in sys.argv and '-E' not in sys.argv 6 | args = sys.argv[1:] 7 | 8 | # change -finstrument-functions-exclude-file-list to suit your needs - this is just a sensible default 9 | # (ofc you don't need to use a compiler wrapper at all; this is just potentially easier than properly 10 | # integrating with the build system on first try. 
more so with -pg than with -finstrument-functions 11 | # because the latter will have problems with an "undefined __return__" function every time gcc is invoked 12 | # with -minstrument-return=call but without funtrace_pg.S...) 13 | args += "-g -pthread -finstrument-functions -finstrument-functions-exclude-file-list=.h,.hpp,/usr/include".split() 14 | 15 | srcdir = os.path.dirname(os.path.dirname(__file__)) 16 | if linking: 17 | is_shared = '-shared' in args 18 | if not is_shared: # don't link the runtime into .so's - only into the executables using them 19 | args += [os.path.join(srcdir, f) for f in ['funtrace.cpp']] + [f'-Wl,--dynamic-list={srcdir}/funtrace.dyn'] 20 | else: 21 | # remove no-undefined for access to the funtrace_* runtime functions 22 | args = [a for a in args if a != '-Wl,--no-undefined'] 23 | args += ['-ldl'] 24 | 25 | funtracexx = os.path.join(srcdir, 'compiler-wrappers/funtrace++') 26 | os.execl(funtracexx, *([funtracexx, 'g++'] + args)) 27 | -------------------------------------------------------------------------------- /funtrace_flags.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | //these definitions must be kept in sync with funtrace2viz's 4 | #define FUNTRACE_RETURN_BIT 63 //normally, a return event logs the address of the returning function... 
5 | #define FUNTRACE_RETURN_WITH_CALLER_ADDRESS_BIT 62 //...except under XRay when it logs the returning function's caller's address 6 | #define FUNTRACE_CATCH_MASK ((1ULL< 3 | 4 | volatile int n; 5 | 6 | extern "C" void NI trace_filtered() 7 | { 8 | n=0; 9 | } 10 | 11 | void NI NOFUNTRACE notrace() 12 | { 13 | n=0; 14 | } 15 | 16 | void NI withtrace() 17 | { 18 | n=0; 19 | } 20 | 21 | const int iter=1000000; 22 | 23 | template 24 | inline uint64_t time(F f, const char* msg, uint64_t base=0) 25 | { 26 | int n=(iter/8)*8; 27 | auto start = funtrace_time(); 28 | for(int i=0; i {{ 8 | println!($($arg)*); 9 | std::process::exit(1); 10 | }}; 11 | } 12 | 13 | fn main() { 14 | let args: Vec = env::args().collect(); 15 | if args.len() != 2 { 16 | fail!("Usage: {} # counts with function names printed to stdout, pipe through c++filt if you want to demangle the symbols", args[0]); 17 | } 18 | 19 | // Open the input file 20 | let file = File::open(args[1].to_string()).expect("failed to open input file"); 21 | let mut reader = BufReader::new(file); 22 | 23 | // Validate and parse the magic strings 24 | let mut line = String::new(); 25 | reader.read_line(&mut line).expect("failed to read FUNCOUNT"); 26 | if line.trim() != "FUNCOUNT" { fail!("missing FUNCOUNT magic string - got `{}'", line); } 27 | line.clear(); 28 | 29 | reader.read_line(&mut line).expect("failed to read PROCMAPS"); 30 | if line.trim() != "PROCMAPS" { fail!("missing PROCMAPS magic string - got `{}'", line); } 31 | line.clear(); 32 | 33 | // Read and parse the memory maps 34 | let mut proc_maps_data = String::new(); 35 | let mut found = false; 36 | while reader.read_line(&mut line).expect("failure reading input file") > 0 { 37 | if line.trim() == "COUNTS" { 38 | found = true; 39 | break; 40 | } 41 | proc_maps_data.push_str(&line); 42 | line.clear(); 43 | } 44 | if !found { fail!("COUNTS magic string not found"); } 45 | line.clear(); 46 | 47 | let input_source = 
Some(procaddr2sym::input_source(args[1].to_string())); 48 | let mut procaddr2sym = ProcAddr2Sym::new(); 49 | procaddr2sym.input_source = input_source; 50 | procaddr2sym.set_proc_maps(proc_maps_data.as_bytes()); 51 | 52 | while reader.read_line(&mut line).expect("failure reading input file") > 0 { 53 | let parts: Vec<&str> = line.trim().split_whitespace().collect(); 54 | if parts.len() != 2 { fail!("Invalid address-count pair {}", line); } 55 | 56 | let address = u64::from_str_radix(parts[0].trim_start_matches("0x"), 16).expect("bad address"); 57 | let count = parts[1].parse::().expect("bad count"); 58 | 59 | let syminfo = procaddr2sym.proc_addr2sym(address); 60 | 61 | println!("{} {:#x} {:#x} {} {} {}:{} {}", count, address, syminfo.static_addr, syminfo.size, syminfo.executable_file, syminfo.file, syminfo.line, syminfo.func); 62 | 63 | line.clear(); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /funtrace.h: -------------------------------------------------------------------------------- 1 | /* 2 | * there are 2 ways to save the trace: 3 | * 4 | * - by getting & saving aside trace data upon interesting events of your choice, 5 | * and eventually writing them out at the time of your choosing. this is good, 6 | * for instance, for keeping a trace corresponding to the slowest observed 7 | * handling of every kind of event (so you throw out this trace and replace it 8 | * with a new one every time you observe an even slower event), and writing 9 | * it all out upon request or when the program terminates. 10 | * 11 | * - by writing to the funtrace.raw file (which is only opened if you call 12 | * funtrace_pause_and_write_current_snapshot() or use `kill -SIGTRAP` on 13 | * the process). 
this is good if you detect moments of peak 14 | * load and want to write the data out immediately, without wasting memory 15 | * for keeping the trace data beyond the cyclic buffers already allocated 16 | * to collect the trace in the first place (the data is written out 17 | * from these buffers while collecting new trace data is paused - same 18 | * as it's paused when data is saved aside for writing out later, but 19 | * for a longer period of time.) the downside is that you can't "unwrite" 20 | * the trace data, and you don't choose when to handle the writing but 21 | * rather have it occur immediately after deciding to save the trace. 22 | */ 23 | #pragma once 24 | 25 | #include 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | /* to "just append the current trace snapshot to funtrace.raw", all you need 32 | is this function (this is also what SIGTRAP does unless you compile with 33 | -DFUNTRACE_NO_SIGTRAP) 34 | 35 | threads cannot be created, and their termination is delayed until the data 36 | is fully written out 37 | 38 | note that if a shared object was unloaded during the time range in the snapshot 39 | (thankfully not a very common scenario), function calls traced from this shared 40 | object will not be possible to decode to symbolic function names (this is true 41 | for all the functions taking snapshots below) 42 | */ 43 | void funtrace_pause_and_write_current_snapshot(); 44 | 45 | /* these methods are for saving trace data snapshots, and then 46 | writing them out at the time of your choosing. */ 47 | 48 | struct funtrace_snapshot; 49 | 50 | /* a snapshot has the size FUNTRACE_BUF_SIZE times the number of threads alive 51 | at the time when it's taken. 
threads can't be created and can't terminate 52 | until the trace data is copied into the snapshot */ 53 | struct funtrace_snapshot* funtrace_pause_and_get_snapshot(); 54 | 55 | /* you might also want to only get the data up to a certain age, 56 | both to save time & space and to get "the part you want" (like from the 57 | start of handling some event till the end) */ 58 | uint64_t funtrace_time(); /* timestamp from the same source used for tracing */ 59 | uint64_t funtrace_ticks_per_second(); /* funtrace_time()/funtrace_ticks_per_second() converts time to seconds */ 60 | 61 | struct funtrace_snapshot* funtrace_pause_and_get_snapshot_starting_at_time(uint64_t time); 62 | struct funtrace_snapshot* funtrace_pause_and_get_snapshot_up_to_age(uint64_t max_event_age); 63 | void funtrace_free_snapshot(struct funtrace_snapshot* snapshot); /* nop if NULL is passed */ 64 | 65 | /* writing out a sample into its own file after it was obtained with funtrace_pause_and_get_snapshot() 66 | does not interfere with threads starting and terminating. TODO: we could add a version with 67 | a "write_data" callback instead of a filename given demand */ 68 | void funtrace_write_snapshot(const char* filename, struct funtrace_snapshot* snapshot); 69 | 70 | /* this is useful to save memory for the event buffer in threads you don't want to trace, 71 | and also to save some but not all of the function call overhead due to being compiled 72 | with tracing enabled */ 73 | void funtrace_ignore_this_thread(); 74 | 75 | /* set this thread's buffer size (must be a power of 2, so defined by a log value, 76 | which must be larger the log of the size of 2 events; 77 | using a smaller value is equivalent to callung funtrace_ignore_this_thread()). */ 78 | void funtrace_set_thread_log_buf_size(int log_buf_size); 79 | 80 | /* disabling tracing will speed things up slightly. note that we don't 81 | free the buffers when disabling tracing and don't reallocate them 82 | when enabling tracing. 
funtrace_ignore_this_thread() is how you free 83 | the buffer of a thread. */ 84 | void funtrace_disable_tracing(); 85 | void funtrace_enable_tracing(); 86 | 87 | #ifdef __clang__ 88 | #define NOFUNTRACE __attribute__((xray_never_instrument)) __attribute__((no_instrument_function)) 89 | #define DOFUNTRACE __attribute__((xray_always_instrument)) 90 | #else 91 | #define NOFUNTRACE __attribute__((no_instrument_function)) 92 | #define DOFUNTRACE /* gcc doesn't have an attribute to force instrumentation */ 93 | #endif 94 | 95 | #ifdef __cplusplus 96 | } 97 | #endif 98 | -------------------------------------------------------------------------------- /funtrace_gdb.py: -------------------------------------------------------------------------------- 1 | # `info proc mappings` format: 2 | # Start Addr End Addr Size Offset objfile 3 | # 0x555555554000 0x555555556000 0x2000 0x0 /path/to/file 4 | # 5 | # /proc/self/maps format - start-end permissions offset device inode /path/to/file 6 | # 7f74a4ae6000-7f74a4b08000 r--p 00000000 103:07 109578392 /usr/lib/x86_64-linux-gnu/libc-2.31.so 7 | 8 | import gdb, struct, traceback 9 | 10 | def write_chunk(f, magic, content): 11 | assert len(magic)==8 12 | f.write(magic) 13 | f.write(struct.pack('Q', len(content))) 14 | f.write(content) 15 | 16 | def write_proc_maps(f): 17 | mappings = gdb.execute('info proc mappings', from_tty=False, to_string=True) 18 | 19 | proc_maps = b'' 20 | for line in mappings.strip().split('\n'): 21 | line = line.strip() 22 | if line.startswith('0x'): 23 | t = line.split() 24 | if len(t) == 5: # we don't care about unnamed segments 25 | start, end, size, offset, path = line.split() 26 | # we don't care about permissions, device and inode 27 | proc_maps += b'%10x-%10x r-xp %08x 0:0 0 %s\n'%(int(start,16), int(end,16), int(offset,16), bytes(path,encoding='utf-8')) 28 | 29 | print('funtrace: saving proc mappings') 30 | write_chunk(f, b'PROCMAPS', proc_maps) 31 | 32 | def get_vector_elements(v): 33 | vis = 
gdb.default_visualizer(v) 34 | if vis: 35 | return [elem for _,elem in list(vis.children())] 36 | else: # no pretty printers - assume we know the representation 37 | start = v['_M_impl']['_M_start'] 38 | finish = v['_M_impl']['_M_finish'] 39 | 40 | return [start[i] for i in range(finish-start)] 41 | 42 | def get_string(s): 43 | vis = gdb.default_visualizer(s) 44 | if False and vis: # the False comments it out since we get garbage like "" with this at times, 45 | # and I haven't found a way to avoid this 46 | return str(vis.to_string().value())[1:-1] 47 | else: # rely on the representation 48 | length = int(s['_M_string_length']) 49 | data_ptr = s['_M_dataplus']['_M_p'] 50 | 51 | return str(gdb.selected_inferior().read_memory(data_ptr, length), encoding='utf-8') 52 | return str(gdb.Value(data_ptr.cast(gdb.lookup_type('char').pointer()).string(length=length)))[1:-1] 53 | 54 | def write_ftrace(f): 55 | try: 56 | handler = gdb.parse_and_eval('g_ftrace_handler') 57 | except: 58 | print("funtrace: compiled without ftrace support - can't fetch last ftrace events") 59 | return 60 | 61 | events = get_vector_elements(handler['events']) 62 | pos = int(handler['pos']) 63 | lines = [] 64 | def collect_lines(start, end): 65 | lines.extend([get_string(events[i]['line']) for i in range(start, end) if int(events[i]['timestamp'])]) 66 | collect_lines(pos, len(events)) # these are the older ones 67 | collect_lines(0, pos) 68 | 69 | print(f'funtrace: saving {len(lines)} ftrace events') 70 | write_chunk(f, b'FTRACETX', ('\n'.join(lines) + '\n').encode('utf-8')) 71 | 72 | def write_funtrace(f): 73 | p_trace_state = gdb.parse_and_eval('g_p_trace_state') 74 | if p_trace_state == gdb.Value(0).cast(p_trace_state.type): 75 | print('funtrace not initialized yet - no trace data') 76 | return 77 | trace_state = p_trace_state.dereference() 78 | 79 | write_chunk(f, b'FUNTRACE', struct.pack('Q', int(trace_state['cpu_freq']))) 80 | 81 | cmdline = get_string(trace_state['cmdline']) 82 | 
print(f'funtrace: core dump generated by `{cmdline}`') 83 | write_chunk(f, b'CMD LINE', cmdline.encode('utf-8')) 84 | 85 | thread_traces = get_vector_elements(trace_state['thread_traces']) 86 | for i, trace in enumerate(thread_traces): 87 | # separately add ftrace 88 | trace = trace.dereference() 89 | thread_id = trace['id'] 90 | buf_size = trace['buf_size'] 91 | write_chunk(f, b'THREADID', bytes(gdb.selected_inferior().read_memory(thread_id.address, thread_id.type.sizeof))) 92 | 93 | buf = trace['buf'] 94 | data = bytes(gdb.selected_inferior().read_memory(buf, buf_size)) 95 | name = thread_id["name"].string() 96 | print(f'funtrace: thread {thread_id["tid"]} {name} - saving {buf_size} bytes of data read from {buf}') 97 | write_chunk(f, b'TRACEBUF', data) 98 | 99 | write_ftrace(f) 100 | 101 | write_chunk(f, b'ENDTRACE', b'') 102 | print('funtrace: done - decode with `funtrace2viz funtrace.raw out` and then view in viztracer (pip install viztracer) with `vizviewer out.json`') 103 | 104 | class FuntraceCmd(gdb.Command): 105 | '''prints the content of the funtrace event buffers into ./funtrace.raw 106 | 107 | you can then decode that file using funtrace2viz and open the output JSON files with vizviewer 108 | (installed by `pip install viztracer`) or Perfetto (https://ui.perfetto.dev, click 109 | "Open with legacy UI" - no source access unlike in vizviewer but otherwise should work)''' 110 | 111 | def __init__(self): 112 | super(FuntraceCmd, self).__init__("funtrace", gdb.COMMAND_DATA) 113 | 114 | def invoke(self, arg, from_tty): 115 | try: 116 | with open('funtrace.raw', 'wb') as f: 117 | write_proc_maps(f) 118 | write_funtrace(f) 119 | except: 120 | traceback.print_exc() 121 | raise 122 | 123 | FuntraceCmd() 124 | -------------------------------------------------------------------------------- /funtrace_pg.S: -------------------------------------------------------------------------------- 1 | #ifdef FUNTRACE_FUNCOUNT 2 | #include "funcount_pg.S" 3 | #else 4 | 5 | 
#include "funtrace_flags.h" 6 | 7 | .p2align 4 8 | .globl __fentry__ 9 | .type __fentry__, @function 10 | .globl __xray_FunctionEntry 11 | .type __xray_FunctionEntry, @function 12 | .globl exe_xray_FunctionEntry 13 | .type exe_xray_FunctionEntry, @function 14 | __fentry__: 15 | __xray_FunctionEntry: 16 | exe_xray_FunctionEntry: 17 | .cfi_startproc 18 | // r11 = g_thread_trace.pos 19 | movq %fs:g_thread_trace@tpoff, %r11 20 | // cyclic buffer wraparound - clear the FUNTRACE_LOG_BUF_SIZE bit in pos 21 | andq %fs:8+g_thread_trace@tpoff, %r11 22 | // if(!g_thread_trace.wraparound_mask) return 23 | je .early_exit_from_fentry 24 | 25 | // r10 = __builtin_return_address(0) 26 | movq (%rsp), %r10 27 | // rdtsc clobbers rdx which might have been used for a caller's parameter - save 28 | pushq %rdx 29 | 30 | // rax = __rdtsc() 31 | rdtsc 32 | salq $32, %rdx 33 | orq %rdx, %rax 34 | // pos->func = return_address 35 | movq %r10, (%r11) 36 | // pos++ 37 | addq $16, %r11 38 | // pos->cycle = rdtsc (the pos _before_ the increment; gcc generated this code...) 39 | movq %rax, -8(%r11) 40 | // save pos back to g_thread_trace.pos 41 | movq %r11, %fs:g_thread_trace@tpoff 42 | 43 | popq %rdx 44 | .early_exit_from_fentry: 45 | ret 46 | 47 | // XRay instrumentation (unlike __fentry__/__return__ -pg instrumentation) calls separate 48 | // functions upon returning from a function and upon tail-calling a function instead of 49 | // returning from its caller. you would think this lets us present the tail call correctly (instead 50 | // of, given f which calls g which tail-calls h, misprepresent h as being called from f, 51 | // which we end up doing under -pg instrumentation because __return__ is called by g 52 | // before jumping to h) 53 | // 54 | // however, in practice, it's not clear how to make use of the tail-call vs return distinction. 
55 | // for example, you push the tail-caller to the stack and pop it when its callee returns; this 56 | // works well if this tail-callee is instrumented or calls at least one instrumented function 57 | // itself, but what if it doesn't - when is the tail-caller going to be "diagnosed" as having 58 | // returned in that case? 59 | // 60 | // XRay itself records distinct event types, EXIT and TAIL_EXIT, and then xray-converter.cpp 61 | // treats them exactly the same in exportAsChromeTraceEventFormat(). we simply record a single 62 | // event type for both events 63 | .cfi_endproc 64 | .size __fentry__, .-__fentry__ 65 | .size __xray_FunctionEntry, .-__xray_FunctionEntry 66 | .size exe_xray_FunctionEntry, .-exe_xray_FunctionEntry 67 | 68 | 69 | .p2align 4 70 | .globl __return__ 71 | .type __return__, @function 72 | .globl __xray_FunctionExit 73 | .type __xray_FunctionExit, @function 74 | .globl exe_xray_FunctionExit 75 | .type exe_xray_FunctionExit, @function 76 | .globl __xray_FunctionTailExit 77 | .type __xray_FunctionTailExit, @function 78 | .globl exe_xray_FunctionTailExit 79 | .type exe_xray_FunctionTailExit, @function 80 | __return__: 81 | __xray_FunctionExit: 82 | exe_xray_FunctionExit: 83 | __xray_FunctionTailExit: 84 | exe_xray_FunctionTailExit: 85 | .cfi_startproc 86 | 87 | movq %fs:g_thread_trace@tpoff, %r11 88 | andq %fs:8+g_thread_trace@tpoff, %r11 89 | je .early_exit_from_return 90 | 91 | movq (%rsp), %r10 92 | 93 | //rdtsc clobbers both of these; __return__ can't clobber rax 94 | //(unlike __fentry__ which can.) 
note that the opposite isn't true - 95 | //__return__ can't clobber rdx "symmetrically" to __fentry__'s clobbering 96 | //of rax, because a tail call can happen after the call to __return__ 97 | //(not sure why gcc does it this way but it does) and this tail call 98 | //might get an argument in rdx 99 | pushq %rdx 100 | pushq %rax 101 | 102 | rdtsc 103 | salq $32, %rdx 104 | orq %rdx, %rax 105 | //this is the main addition in __return__ to the code of __fentry__ 106 | #ifndef __clang__ 107 | btsq $FUNTRACE_RETURN_BIT, %r10 108 | #else 109 | btsq $FUNTRACE_RETURN_WITH_CALLER_ADDRESS_BIT, %r10 //XRay jumps to the exit handler 110 | //rather than calling it; you get an integer ID of the returning function in a register 111 | //but we don't use it (it's not that trivial to decode, and XRay itself still doesn't 112 | //do it for functions in shared objects as of early 2025). our return address thus 113 | //points into the caller of the returning function 114 | #endif 115 | movq %rax, 8(%r11) 116 | addq $16, %r11 117 | movq %r10, -16(%r11) 118 | movq %r11, %fs:g_thread_trace@tpoff 119 | 120 | popq %rax 121 | popq %rdx 122 | .early_exit_from_return: 123 | ret 124 | 125 | .cfi_endproc 126 | .size __return__, .-__return__ 127 | .size __xray_FunctionExit, .-__xray_FunctionExit 128 | .size exe_xray_FunctionExit, .-exe_xray_FunctionExit 129 | .size __xray_FunctionTailExit, .-__xray_FunctionTailExit 130 | .size exe_xray_FunctionTailExit, .-exe_xray_FunctionTailExit 131 | #endif 132 | -------------------------------------------------------------------------------- /compiler-wrappers/funtrace++: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | ''' 3 | usage: funtrace++ [-funtrace-instr-thresh=N] [-funtrace-no-trace=] [-funtrace-do-trace=] [-funtrace-ignore-loops] 4 | ''' 5 | import os, sys, subprocess 6 | 7 | class TraceSupressor: 8 | def __init__(self, args): 9 | self.instr_thresh = 0 10 | self.ignore_loops = False 11 | 
self.no_trace = [] 12 | self.do_trace = [] 13 | self.verbose = False 14 | def val(arg): return arg.split('=')[-1] 15 | def lines(file): return file, open(file).read().strip().split() 16 | for arg in args: 17 | if arg.startswith('-funtrace-instr-thresh='): 18 | self.instr_thresh = int(val(arg)) 19 | elif arg.startswith('-funtrace-no-trace='): 20 | self.no_file, self.no_trace = lines(val(arg)) 21 | elif arg.startswith('-funtrace-do-trace='): 22 | self.do_file, self.do_trace = lines(val(arg)) 23 | elif arg == '-funtrace-ignore-loops': 24 | self.ignore_loops = True 25 | elif arg == '-funtrace-verbose': 26 | self.verbose = True 27 | 28 | def suppress(self, funcname, num_instr, loops): 29 | if funcname in self.do_trace: 30 | if self.verbose: 31 | print(f'{funcname} listed in the do-trace file {self.do_file}') 32 | return 33 | reason = None 34 | if funcname in self.no_trace: 35 | reason = f'{funcname} listed in the trace suppression file {self.no_file}' 36 | elif num_instr < self.instr_thresh: 37 | if not loops: 38 | reason = f'{funcname} has {num_instr} instructions, less than -funtrace-instr-thresh={self.instr_thresh}' 39 | elif self.ignore_loops: 40 | reason = f'{funcname} has {num_instr} instructions, less than -funtrace-instr-thresh={self.instr_thresh}; it has {loops} loops but -funtrace-ignore-loops was passed' 41 | if self.verbose and reason: 42 | print(reason) 43 | return reason 44 | 45 | # note that we don't support filtering XRay's output, in part on the theory that it already has 46 | # -fxray-instruction-threshold=N; to support it, we'd need to look for the NOPs it inserts - it doesn't 47 | # put in call instructions, that's done by runtime code patching 48 | hooks = [ 49 | '__cyg_profile_func_enter', 50 | '__cyg_profile_func_exit', 51 | '__fentry__', 52 | '__return__', 53 | ] 54 | 55 | def filter_asm(asm_file, suppressor): 56 | with open(asm_file) as f: 57 | lines = f.read().split('\n') 58 | 59 | funcname = None 60 | infunc = False 61 | instrs = 0 62 | loops 
= 0 63 | labels = [] 64 | funcstart = None 65 | 66 | changed = False 67 | 68 | for i,line in enumerate(lines): 69 | l = line.strip() 70 | if l.startswith('.type') and l.endswith('@function'): 71 | funcname = l.split(',')[0].split()[-1] 72 | elif l == '.cfi_startproc': 73 | #print('in func',funcname) 74 | infunc = True 75 | funcstart = i+1 76 | instrs = 0 77 | loops = 0 78 | labels = [] 79 | elif l == '.cfi_endproc': 80 | #print('end func', funcname, instrs, loops) 81 | infunc = False 82 | suppression_reason = suppressor.suppress(funcname, instrs, loops) 83 | if not suppression_reason: 84 | continue 85 | 86 | for j in range(funcstart, i): 87 | l = lines[j].strip() 88 | for hook in hooks: 89 | if 'call' in l or 'jmp' in l: 90 | if hook in l: 91 | lines[j] = '# ' + lines[j] + ' # ' + suppression_reason 92 | if 'jmp' in l: # tail call 93 | lines[j] = ' ret ' + lines[j] 94 | changed = True 95 | break 96 | elif infunc: 97 | if not l: 98 | continue 99 | t = l.split()[0] 100 | isinstr = line[0].isspace() and not t.startswith('.') and not t.endswith(':') 101 | if isinstr: 102 | instrs += 1 103 | for label in labels: 104 | if label in l: 105 | loops += 1 106 | break 107 | elif t.startswith('.') and t.endswith(':'): 108 | labels.append(t[:-1]) 109 | 110 | if changed: 111 | with open(asm_file, 'w') as f: 112 | f.write('\n'.join(lines)) 113 | 114 | def exec_compiler(cmd, execl=True): 115 | compiler = cmd[0] 116 | if not os.path.exists(compiler): 117 | compiler = subprocess.getoutput(f'which {compiler}') 118 | if execl: 119 | os.execl(compiler, *cmd) 120 | else: 121 | subprocess.run([compiler]+cmd[1:]) 122 | 123 | def compile_filter_and_assemble(cmd, funtrace_args): 124 | suppressor = TraceSupressor(funtrace_args) 125 | compile_to_asm_cmd, assemble_cmd, asm_file = compile_and_assemble_commands(cmd) 126 | 127 | #print(' '.join(compile_to_asm_cmd)) 128 | exec_compiler(compile_to_asm_cmd, execl=False) 129 | 130 | filter_asm(asm_file, suppressor) 131 | 132 | #print(' 
'.join(assemble_cmd)) 133 | exec_compiler(assemble_cmd, execl=False) 134 | 135 | def compile_and_assemble_commands(cmd): 136 | ofile = None 137 | cfile = None 138 | sfile = None 139 | 140 | extensions = 'c cpp cc cxx cp CPP c++ C'.split() 141 | def is_src_arg(arg): 142 | if arg.startswith('-'): 143 | return False 144 | for ext in extensions: 145 | if arg.endswith(ext): 146 | return True 147 | 148 | for i,arg in enumerate(cmd): 149 | if arg == '-o' and i+1 < len(cmd): 150 | ofile = cmd[i+1] 151 | sfile = ofile+'.s' 152 | elif is_src_arg(arg): 153 | cfile = arg 154 | 155 | if cfile: 156 | if not ofile: 157 | ofile = cfile[:cfile.rfind('.')] + '.o' 158 | sfile = cfile[:cfile.rfind('.')] + '.s' 159 | compile_to_asm_cmd = [('-S' if arg == '-c' else arg) for arg in cmd] + ['-o',sfile] 160 | else: 161 | compile_to_asm_cmd = [('-S' if arg == '-c' else (sfile if arg == ofile else arg)) for arg in cmd] 162 | else: 163 | print(f'funtrace++ - WARNING: -c passed but could not determine the input source file in `{cmd}`') 164 | exec_compiler(cmd) 165 | 166 | assemble_cmd = [cmd[0], '-c', sfile, '-o', ofile] 167 | if 'clang' in cmd[0]: 168 | assemble_cmd += ['-Wa,-W'] # clang produces assembly using MD5 sums for some source files but not others and then the assembler warns of 169 | # "inconsistent use of md5 sums", not sure how to suppress this better... 
170 | 171 | return compile_to_asm_cmd, assemble_cmd, sfile 172 | 173 | def main(): 174 | cmd = sys.argv[1:] 175 | funtrace_args = [arg for arg in cmd if arg.startswith('-funtrace-')] 176 | cmd = [arg for arg in cmd if not arg.startswith('-funtrace-')] 177 | if '-c' in cmd and '-E' not in cmd and '-dM' not in cmd and funtrace_args: 178 | compile_filter_and_assemble(cmd, funtrace_args) 179 | else: 180 | exec_compiler(cmd) 181 | 182 | if __name__ == '__main__': 183 | main() 184 | -------------------------------------------------------------------------------- /funcount.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #ifndef FUNCOUNT_PAGE_TABLES 11 | #define FUNCOUNT_PAGE_TABLES 1 12 | #endif 13 | 14 | #ifdef __clang__ 15 | #define NOINSTR __attribute__((xray_never_instrument)) __attribute__((no_instrument_function)) 16 | #else 17 | #define NOINSTR __attribute__((no_instrument_function)) 18 | #endif 19 | #define INLINE __attribute__((always_inline)) 20 | 21 | const int PAGE_BITS = 16; //works for a 2-level page table with 48b virtual addresses 22 | //which is OK for most userspace address spaces 23 | const int PAGE_SIZE = 1<> PAGE_BITS*2; 29 | //make sure bits higher than PAGE_BITS*3 are not set 30 | assert((bits & PAGE_BITS_MASK) == bits && "pointer has more than 48 bits set - try recompiling funcount.cpp with a larger PAGE_BITS constant"); 31 | return bits; 32 | } 33 | 34 | inline uint64_t NOINSTR mid_bits(uint64_t address) { return (address >> PAGE_BITS) & PAGE_BITS_MASK; } 35 | inline uint64_t NOINSTR low_bits(uint64_t address) { return address & PAGE_BITS_MASK; } 36 | 37 | //8-byte counts have the downside where very short functions are counted together; 38 | //4-byte counts would have been better for this but would be more likely to overflow 39 | typedef uint64_t count_t; 40 | 41 | struct CountsPage 42 | { 43 | 
std::atomic counts[PAGE_SIZE/sizeof(count_t)]; 44 | NOINSTR CountsPage() { memset(counts, 0, sizeof(counts)); } 45 | }; 46 | 47 | struct CountsPagesL1 48 | { 49 | CountsPage* pages[PAGE_SIZE]; 50 | NOINSTR CountsPagesL1() { memset(pages, 0, sizeof(pages)); } 51 | }; 52 | 53 | struct CountsPagesL2 54 | { 55 | CountsPagesL1* pagesL1[PAGE_SIZE]; 56 | //this counts function calls in executable segments not mapped at the time 57 | //when the code was running (allocate_range() wasn't called); AFAIK this 58 | //should be limited to constructors in shared objects (which get called 59 | //before we get a chance to call dl_iterate_phdr() to update our view 60 | //of the address space) 61 | // 62 | //note that these misses could be avoided by allocating the pages on demand 63 | //when a function is first called; however this slows things down even 64 | //if it's done in a non-thread-safe manner (potentially leaking pages and 65 | //losing call counts) and more so if it's done with in a thread-safe way 66 | //(we have a commit in the history doing this with 2 compare_exchange_strong() 67 | //calls.) for the purpose of finding the most commonly called functions 68 | //in order to exclude them from funtrace instrumentation, it's probably better 69 | //to limit the slowdown (s.t. 
interactive / real-time flows have some chance 70 | //of being usable for collecting statistics) at the expense of missing 71 | //constructor calls in dynamic libraries (which are very unlikely to be 72 | //where you badly need to suppress funtrace instrumentation because of 73 | //its overhead) 74 | std::atomic unknown; 75 | 76 | void NOINSTR init() 77 | { 78 | memset(pagesL1, 0, sizeof(pagesL1)); 79 | unknown = 0; 80 | } 81 | 82 | void NOINSTR allocate_range(uint64_t base, uint64_t size) 83 | { 84 | uint64_t start = base & ~PAGE_BITS_MASK; 85 | uint64_t end = (base + size + PAGE_SIZE - 1) & ~PAGE_BITS_MASK; 86 | for(uint64_t address=start; address<=end; address+=PAGE_SIZE) { 87 | auto high = high_bits(address); 88 | auto& pages = pagesL1[high]; 89 | if(!pages) { 90 | pages = new CountsPagesL1; 91 | } 92 | 93 | auto mid = mid_bits(address); 94 | auto& page = pages->pages[mid]; 95 | if(!page) { 96 | page = new CountsPage; 97 | } 98 | } 99 | } 100 | 101 | std::atomic& INLINE NOINSTR get_count(uint64_t address) 102 | { 103 | auto high = high_bits(address); 104 | auto pages = pagesL1[high]; 105 | if(!pages) { 106 | return unknown; 107 | } 108 | 109 | auto mid = mid_bits(address); 110 | auto page = pages->pages[mid]; 111 | if(!page) { 112 | return unknown; 113 | } 114 | 115 | auto low = low_bits(address); 116 | return page->counts[low / sizeof(count_t)]; 117 | } 118 | 119 | NOINSTR ~CountsPagesL2(); 120 | }; 121 | 122 | static CountsPagesL2 g_page_tab[FUNCOUNT_PAGE_TABLES]; 123 | 124 | static inline unsigned int INLINE NOINSTR core_num() 125 | { 126 | unsigned int aux; 127 | __rdtscp(&aux); 128 | return aux & 0xfff; 129 | } 130 | 131 | extern "C" void NOINSTR __cyg_profile_func_enter(void* func, void* caller) 132 | { 133 | static_assert(sizeof(count_t) == sizeof(std::atomic), "wrong size of atomic"); 134 | uint64_t addr = (uint64_t)func; 135 | int tab_ind = FUNCOUNT_PAGE_TABLES == 1 ? 
0 : core_num() % FUNCOUNT_PAGE_TABLES; 136 | std::atomic& count = g_page_tab[tab_ind].get_count(addr); 137 | count += 1; 138 | } 139 | 140 | extern "C" void NOINSTR __cyg_profile_func_exit(void* func, void* caller) {} 141 | 142 | #include 143 | #include 144 | #include 145 | 146 | NOINSTR CountsPagesL2::~CountsPagesL2() 147 | { 148 | //the first object in the array is constructed first and destroyed last - 149 | auto last_page_tab = &g_page_tab[0]; 150 | 151 | std::ofstream out; 152 | if(this == last_page_tab) { 153 | out.open("funcount.txt"); 154 | out << "FUNCOUNT\nPROCMAPS\n"; 155 | std::ifstream maps_file("/proc/self/maps", std::ios::binary); 156 | if (!maps_file.is_open()) { 157 | std::cerr << "funtrace - failed to open /proc/self/maps, traces will be impossible to decode" << std::endl; 158 | return; 159 | } 160 | 161 | std::vector maps_data( 162 | (std::istreambuf_iterator(maps_file)), 163 | std::istreambuf_iterator()); 164 | 165 | maps_file.close(); 166 | out.write(&maps_data[0], maps_data.size()); 167 | out << "COUNTS\n"; 168 | } 169 | 170 | for(uint64_t hi=0; hipages[mid]; 175 | if(page) { 176 | for(uint64_t lo=0; locounts[lo]; 178 | if(count) { 179 | uint64_t address = (hi << PAGE_BITS*2) | (mid << PAGE_BITS) | (lo * sizeof(count_t)); 180 | if(this == last_page_tab) { 181 | //print the final counts 182 | out << std::hex << "0x" << address << ' ' << std::dec << count << '\n'; 183 | } 184 | else { 185 | //accumulate the results into the first page table 186 | last_page_tab->get_count(address) += count; 187 | } 188 | } 189 | } 190 | } 191 | pages->pages[mid] = nullptr; 192 | delete page; 193 | } 194 | pagesL1[hi] = nullptr; 195 | delete pages; 196 | } 197 | } 198 | if(unknown) { 199 | if(this == last_page_tab) { 200 | std::cout << "WARNING: " << unknown << " function calls were to functions in parts of the address space unknown at the time they were made (likely constructors in shared objects)" << std::endl; 201 | } 202 | else { 203 | last_page_tab->unknown 
+= unknown; 204 | } 205 | } 206 | if(this == last_page_tab) { 207 | std::cout << "function call count report saved to funcount.txt - decode with funcount2sym to get: call_count, dyn_addr, static_addr, num_bytes, bin_file, src_file:src_line, mangled_func_name" << std::endl; 208 | } 209 | } 210 | 211 | static int NOINSTR phdr_callback (struct dl_phdr_info *info, size_t size, void *data) 212 | { 213 | for(int i=0; idlpi_phnum; ++i ) { 214 | const auto& phdr = info->dlpi_phdr[i]; 215 | if(phdr.p_type == PT_LOAD && (phdr.p_flags & PF_X)) { 216 | uint64_t start_addr = info->dlpi_addr + phdr.p_vaddr; 217 | for(int t=0; t) -> Option<&MemoryMap> { 22 | maps.binary_search_by(|map| { 23 | if address < map.address.0 { 24 | std::cmp::Ordering::Greater // Address is before this map 25 | } else if address >= map.address.1 { 26 | std::cmp::Ordering::Less // Address is after this map 27 | } else { 28 | std::cmp::Ordering::Equal // Address is within this map 29 | } 30 | }) 31 | .ok() 32 | .map(|index| &maps[index]) 33 | } 34 | 35 | struct Symbol { 36 | base_address: u64, 37 | size: u64, 38 | name: String, 39 | } 40 | 41 | fn read_elf_symbols(elf: &Elf) -> Vec { 42 | // Create a vector to store our symbols 43 | let mut symbols = Vec::new(); 44 | 45 | // Process dynamic symbols if they exist 46 | for sym in elf.dynsyms.iter() { 47 | // Get the symbol name from the dynamic string table 48 | if let Some(name) = elf.dynstrtab.get_at(sym.st_name) { 49 | symbols.push(Symbol { 50 | base_address: sym.st_value, 51 | size: sym.st_size, 52 | name: name.to_string(), 53 | }); 54 | } 55 | } 56 | 57 | // Process regular symbols if they exist 58 | for sym in elf.syms.iter() { 59 | // Get the symbol name from the string table 60 | if let Some(name) = elf.strtab.get_at(sym.st_name) { 61 | symbols.push(Symbol { 62 | base_address: sym.st_value, 63 | size: sym.st_size, 64 | name: name.to_string(), 65 | }); 66 | } 67 | } 68 | 69 | // Sort symbols by base address 70 | symbols.sort_by_key(|sym| 
sym.base_address); 71 | 72 | symbols 73 | } 74 | 75 | fn find_symbol(symbols: &Vec, address: u64) -> Option<&Symbol> { 76 | // Binary search for the largest base address that's <= our target address 77 | let idx = match symbols.binary_search_by_key(&address, |sym| sym.base_address) { 78 | Ok(exact) => exact, 79 | Err(insert_pos) => { 80 | if insert_pos == 0 { 81 | return None; 82 | } 83 | insert_pos - 1 84 | } 85 | }; 86 | 87 | // Get candidate symbol and check if address falls within its range 88 | let candidate = &symbols[idx]; 89 | if address >= candidate.base_address && address < candidate.base_address + candidate.size { 90 | Some(candidate) 91 | } else { 92 | None 93 | } 94 | } 95 | 96 | #[derive(Debug)] 97 | struct SubsPath { 98 | src: String, 99 | dst: String, 100 | } 101 | 102 | fn parse_substitute_path_json(file_name: &str) -> Vec { 103 | let mut file = match File::open(file_name) { 104 | Ok(file) => file, 105 | Err(_) => { 106 | return Vec::new(); 107 | } 108 | }; 109 | 110 | let mut json_str = String::new(); 111 | if let Err(e) = file.read_to_string(&mut json_str) { 112 | eprintln!("Warning: Failed to read from file '{}': {}", file_name, e); 113 | return Vec::new(); 114 | } 115 | 116 | let json_value: Value = match serde_json::from_str(&json_str) { 117 | Ok(value) => value, 118 | Err(e) => { 119 | eprintln!("Warning: Failed to parse JSON in file '{}': {}", file_name, e); 120 | return Vec::new(); 121 | } 122 | }; 123 | 124 | let mut subs_paths = Vec::new(); 125 | 126 | if let Some(array) = json_value.as_array() { 127 | for item in array { 128 | if let Some(inner_array) = item.as_array() { 129 | if inner_array.len() == 2 { 130 | if let (Some(src), Some(dst)) = (inner_array[0].as_str(), inner_array[1].as_str()) { 131 | subs_paths.push(SubsPath { 132 | src: src.to_string(), 133 | dst: dst.to_string(), 134 | }); 135 | } else { 136 | eprintln!("Warning: Invalid string pair in file '{}'", file_name); 137 | } 138 | } else { 139 | eprintln!("Warning: Array does 
not contain exactly 2 elements in file '{}'", file_name); 140 | } 141 | } else { 142 | eprintln!("Warning: Expected array in file '{}'", file_name); 143 | } 144 | } 145 | } else { 146 | eprintln!("Warning: Top level object is not an array in file '{}'", file_name); 147 | } 148 | 149 | subs_paths 150 | } 151 | 152 | struct ExecutableFileMetadata 153 | { 154 | program_headers: Vec, 155 | addr2line: Context>>, 156 | symbols: Vec, 157 | } 158 | 159 | pub struct InputSource { 160 | path: String, 161 | modified: SystemTime, 162 | } 163 | 164 | pub fn input_source(path: String) -> InputSource { 165 | InputSource { path: path.clone(), modified: fs::metadata(path).unwrap().modified().unwrap() } 166 | } 167 | 168 | pub struct ProcAddr2Sym { 169 | maps: Vec, 170 | sym_cache: HashMap, 171 | sym_missing: HashSet, 172 | offset_cache: HashMap, 173 | source_files: HashSet, //kept just to print "modified after the input source" warnings once per file 174 | subs_path: Vec, 175 | pub input_source: Option, 176 | } 177 | 178 | #[derive(Debug, Clone, Hash, PartialEq, std::cmp::Eq)] 179 | pub struct SymInfo { 180 | pub func: String, //before c++filt 181 | pub demangled_func: String, //after c++filt 182 | //note that these are, whenever possible, the file:line of the FIRST function 183 | //address, NOT the address passed to proc_addr2sym! 184 | //TODO: given demand we can provide a way to pass the file:line of the actual 185 | //address passed to proc_addr2sym 186 | pub file: String, //source file 187 | pub line: u32, //line number in the file 188 | pub executable_file: String, //executable or shared object 189 | pub static_addr: u64, //the address in the executable's symbol table 190 | //(without the dynamic offset to which it's loaded - this offset is subtracted 191 | //from the input address passed to proc_addr2sym()). 
like file:line, whenever 192 | //possible, this is the base address of the function, not the address 193 | //directly corresponding to the input dynamic address 194 | pub size: u64, //0 if no symbol found 195 | } 196 | 197 | fn time2str(time: &SystemTime) -> String { 198 | let datetime: DateTime = (*time).into(); 199 | datetime.format("%Y-%m-%d %H:%M:%S").to_string() 200 | } 201 | 202 | //sometimes you will see function names like "f(int) [clone .constprop.1]" 203 | //or "f(int) [clone .cold]", due to the compiler generating multiple copies of the code for various reasons. 204 | //we strip this "[clone .whatever]" stuff, not only because it's not too helpful for human users, 205 | //but because it actively interferes with eg exception handling (when throw/catch return us to "f() [clone .cold]" 206 | //we need to know that it's the same as the "f()" we have on our stack to be able to pop f's callees from the stack; 207 | //we don't want "[clone .cold]" to throw us off) 208 | fn strip_clone(input: String) -> String { 209 | if let Some(index) = input.find(" [clone ") { 210 | input[..index].to_string() 211 | } else { 212 | input 213 | } 214 | } 215 | 216 | impl ProcAddr2Sym { 217 | pub fn new() -> Self { 218 | ProcAddr2Sym { maps: Vec::new(), sym_cache: HashMap::new(), sym_missing: HashSet::new(), offset_cache: HashMap::new(), source_files: HashSet::new(), 219 | subs_path: parse_substitute_path_json("substitute-path.json"), input_source: None } 220 | } 221 | 222 | fn substitute_path(&self, path: String) -> String { 223 | let mut s = path; 224 | for subs in &self.subs_path { 225 | s = s.replace(&subs.src, &subs.dst); 226 | } 227 | s 228 | } 229 | 230 | // note that updating the maps doesn't invalidate sym_cache - we don't need to parse 231 | // the DWARF of the executables / shared objects again; but it does invalidate offset_cache 232 | // since the same shared object might have been loaded to a different offset 233 | pub fn set_proc_maps(&mut self, proc_maps_data: 
&[u8]) { 234 | let memory_maps = MemoryMaps::from_buf_read(proc_maps_data).expect("failed to parse /proc/self/maps data"); 235 | self.maps = memory_maps.into_iter().collect(); 236 | // not sure we need to sort them - /proc/self/maps appears already sorted - but can't hurt 237 | self.maps.sort_by_key(|map| map.address.0); 238 | self.offset_cache = HashMap::new(); 239 | } 240 | 241 | pub fn unknown_symbol(&self) -> SymInfo { 242 | return SymInfo { func: "??".to_string(), demangled_func: "??".to_string(), file: "??".to_string(), line: 0, executable_file: "??".to_string(), static_addr: 0, size: 0 }; 243 | } 244 | 245 | pub fn proc_addr2sym(&mut self, proc_address: u64) -> SymInfo { 246 | let unknown = self.unknown_symbol(); 247 | let map_opt = find_address_in_maps(proc_address, &self.maps); 248 | if map_opt == None { return unknown; } 249 | let map = map_opt.unwrap(); 250 | 251 | let path_opt = match &map.pathname { 252 | MMapPath::Path(p) => Some(p), 253 | _ => None, 254 | }; 255 | if path_opt == None { return unknown; } 256 | let path = path_opt.unwrap(); 257 | 258 | let pathstr = self.substitute_path(path.to_string_lossy().to_string()); 259 | if self.sym_missing.contains(&pathstr) { 260 | return unknown; 261 | } 262 | if !self.sym_cache.contains_key(&pathstr) { 263 | let fileopt = File::open(pathstr.clone()); 264 | if fileopt.is_err() { 265 | println!("WARNING: couldn't open executable file {} - you can remap paths using a substitute-path.json file in your working directory", pathstr); 266 | self.sym_missing.insert(pathstr); 267 | return unknown; 268 | } 269 | let file = fileopt.unwrap(); 270 | if let Some(ref input_source) = self.input_source { 271 | let modified = fs::metadata(pathstr.clone()).expect("failed to stat file").modified().expect("failed to get last modification timestamp"); 272 | if modified > input_source.modified { 273 | println!("WARNING: executable file {} last modified at {} - later than {} ({})", pathstr, time2str(&modified), input_source.path, 
time2str(&input_source.modified)); 274 | } 275 | } 276 | let buffer = unsafe { Mmap::map(&file).expect("failed to mmap executable file") }; 277 | let elf = Elf::parse(&buffer).expect("Failed to parse ELF"); 278 | let symbols = read_elf_symbols(&elf); 279 | let program_headers = elf.program_headers.clone(); 280 | let object = object::File::parse(&*buffer).expect("Failed to parse ELF"); 281 | let ctx = addr2line::Context::new(&object).expect("Failed to create addr2line context"); 282 | self.sym_cache.insert(pathstr.clone(), ExecutableFileMetadata { program_headers, addr2line: ctx, symbols }); 283 | } 284 | let meta = self.sym_cache.get(&pathstr).unwrap(); 285 | 286 | if !self.offset_cache.contains_key(&map.address.0) { 287 | //find the program header containing the file offset of this mapping 288 | let mut found = false; 289 | for phdr in meta.program_headers.iter() { 290 | if map.offset >= phdr.p_offset && map.offset < (phdr.p_offset + phdr.p_filesz) { 291 | let vaddr_offset = (map.offset - phdr.p_offset) + phdr.p_vaddr; 292 | self.offset_cache.insert(map.address.0, vaddr_offset); 293 | found = true; 294 | break; 295 | } 296 | } 297 | if !found { return unknown; } 298 | } 299 | let vaddr_offset = self.offset_cache.get(&map.address.0).unwrap(); 300 | let mut static_addr = proc_address - map.address.0 + vaddr_offset; 301 | let mut size = 0; 302 | 303 | let mut name = "??".to_string(); 304 | let mut demangled_func = "??".to_string(); 305 | let mut name_found = false; 306 | 307 | if let Some(sym) = find_symbol(&meta.symbols, static_addr) { 308 | name_found = true; 309 | name = sym.name.clone(); 310 | static_addr = sym.base_address; 311 | size = sym.size; 312 | if let Ok(demsym) = cpp_demangle::Symbol::new(name.clone()) { 313 | demangled_func = demsym.to_string(); 314 | } 315 | else { 316 | demangled_func = name.clone(); 317 | } 318 | } 319 | 320 | let (file, linenum) = match meta.addr2line.find_location(static_addr) { 321 | Ok(Some(location)) => 
(location.file.unwrap_or("??"), location.line.unwrap_or(0)), 322 | _ => ("??",0), 323 | }; 324 | let file = self.substitute_path(file.to_string()); 325 | if let Some(ref input_source) = self.input_source { 326 | let file = file.clone(); 327 | if !self.source_files.contains(&file) { 328 | //don't warn if we can't access the file (maybe the source code isn't supposed to be 329 | //on this machine or it's a relative path or whatever); do warn if we can access it and it's newer than the data 330 | //source - very likely a mistake the user should be aware of 331 | if let Ok(meta) = fs::metadata(file.clone()) { 332 | if let Ok(modified) = meta.modified() { 333 | if modified > input_source.modified { 334 | println!("WARNING: source file {} last modified at {} - later than {} ({})", file, time2str(&modified), input_source.path, time2str(&input_source.modified)); 335 | } 336 | } 337 | } 338 | self.source_files.insert(file); 339 | } 340 | } 341 | 342 | if !name_found { 343 | //not sure if we are ever going to meet a case where there's no ELF symbol name 344 | //but we do have DWARF debug info but can't hurt to try. 
345 | // 346 | //there are at least 3 reasons not to use this code by itself, without bothering 347 | //with ELF symbol tables at all: 348 | // 349 | //* sometimes you have ELF symbols but no DWARF debug info 350 | //* some functions (such as "virtual" and "non-virtual" "thunks" auto-generated by gcc 351 | // have an ELF symbol but no debug info in DWARF (at least not function name info; 352 | // and incidentally we very much _need_ this info because such thunks have __return__ 353 | // without __fentry__ and we need to keep this from mauling the decoded trace) 354 | //* we want, at least in funtrace's context, to find file:line of the first function 355 | // address, which the ELF symbol readily makes available 356 | // 357 | //but it seems harmless to keep this code as fallback just in case 358 | //(in any case we use addr2line for the file:line info so "the object is already there".) 359 | if let Ok(frames) = meta.addr2line.find_frames(static_addr).skip_all_loads() { 360 | if let Ok(Some(frame)) = frames.last() { 361 | if let Some(funref) = frame.function.as_ref() { 362 | if let Ok(fname) = funref.raw_name() { 363 | name = fname.to_string(); 364 | demangled_func = name.clone(); 365 | } 366 | if let Ok(dname) = funref.demangle() { 367 | demangled_func = dname.to_string(); 368 | } 369 | } 370 | } 371 | } 372 | } 373 | SymInfo{func:strip_clone(name), demangled_func:strip_clone(demangled_func), file, line:linenum, executable_file:pathstr, static_addr, size} 374 | } 375 | } 376 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import json 3 | import os 4 | import glob 5 | from multiprocessing import Pool 6 | 7 | call='+' 8 | ret='-' 9 | 10 | def parse_perfetto_json(fname): 11 | with open(fname) as f: 12 | data = json.load(f) 13 | events = data['traceEvents'] 14 | threads = {} 15 | thread_names = {} 16 | 
thread2timestamps = {} 17 | for event in events: 18 | phase = event['ph'] 19 | tid = event['tid'] 20 | name = event['name'] 21 | if 'std::thread::_Invoker' in name or 'std::thread::thread' in name: # we use std::thread in tests - ignore the noise it adds to traces 22 | continue # 23 | if phase == 'M': # metadata 24 | if name == 'thread_name': 25 | thread_names[tid] = event['args']['name'] 26 | if thread_names[tid] in threads: # not a unique name 27 | thread_names[tid] += '.%d'%tid # mangle by tid 28 | continue 29 | assert phase == 'X' # complete event 30 | timepoints = threads.setdefault(thread_names[tid], list()) 31 | timestamp = event['ts'] 32 | duration = event['dur'] 33 | 34 | timestamps = thread2timestamps.setdefault(tid, dict()) 35 | 36 | assert timestamp not in timestamps, f'expecting unique timestamps in every thread! 2 events with the same timestamp: call of {event}; {timestamps[timestamp]}' 37 | assert timestamp+duration not in timestamps, f'expecting unique timestamps in every thread! 
2 events with the same timestamp: return of {event}; {timestamps[timestamp+duration]}' 38 | timestamps[timestamp] = ('call',event) 39 | timestamps[timestamp+duration] = ('ret',event) 40 | 41 | timepoints.append((call, name, timestamp)) 42 | timepoints.append((ret, name, timestamp+duration)) 43 | 44 | # sort by the timestamp 45 | for timepoints in threads.values(): 46 | timepoints.sort(key=lambda t: (t[2])) 47 | 48 | data['threads'] = threads 49 | 50 | return data 51 | 52 | def print_thread(flow,line=-1): 53 | level = 0 54 | for i,point in enumerate(flow): 55 | what = point[0] 56 | name = point[1] 57 | if what == ret: 58 | level -= 1 59 | start = ' '*level 60 | if line>=0: 61 | if i= 150000 352 | 353 | func_start = None 354 | func_finish = None 355 | for what, func, when in threads[thread]: 356 | if func.startswith('sleep'): 357 | if what == call: 358 | func_start = when 359 | else: 360 | func_finish = when 361 | break 362 | assert func_start is not None and func_finish is not None 363 | print(' in sleep() for', func_finish - func_start) 364 | assert start > func_start and finish < func_finish 365 | 366 | def system(cmd): 367 | print('running',cmd) 368 | status = os.system(cmd) 369 | if 'killed' not in cmd: # we have a test that kills itself with SIGKILL - other than that commands shouldn't fail 370 | assert status==0, f'`{cmd}` failed with status {status}' 371 | 372 | BUILDDIR = './built-tests' 373 | OUTDIR = './out' 374 | TARGET = 'x86_64-unknown-linux-gnu' 375 | 376 | def build_trace_analysis_tools(): 377 | system(f'RUSTFLAGS="-C target-feature=+crt-static" cargo build -r --target {TARGET}') 378 | 379 | def run_cmds(cmds): 380 | for cmd in cmds: 381 | system(cmd) 382 | 383 | def build_cxx_test(main, shared=[], dyn_shared=[], flags=''): 384 | cmdlists = [] 385 | binaries = {} 386 | for mode in ['fi-gcc','fi-clang','pg','xray']: 387 | CXXFLAGS=f"-O3 -std=c++11 -Wall {flags}" 388 | if mode == 'xray': 389 | CXXFLAGS += " -fxray-instruction-threshold=1" 390 | compiler 
= { 391 | 'fi-gcc':'finstr-g++', 392 | 'fi-clang':'finstr-clang++', 393 | 'pg':'pg-g++', 394 | 'xray':'xray-clang++', 395 | } 396 | CXX = f'./compiler-wrappers/funtrace-{compiler[mode]}' 397 | test = main.split('.')[0] 398 | binary = f'{BUILDDIR}/{test}.{mode}' 399 | cmds = [] 400 | LIBS = '' 401 | DYNLIBS = '' 402 | if shared or dyn_shared: 403 | for cpp in shared+dyn_shared: 404 | module = cpp.split('.')[0] 405 | lib = f'{os.path.realpath(BUILDDIR)}/{module}.{mode}.so' 406 | cmds += [ 407 | f'{CXX} -c tests/{cpp} -o {BUILDDIR}/{module}.{mode}.o {CXXFLAGS} -I. -fPIC', 408 | f'{CXX} -o {lib} {BUILDDIR}/{module}.{mode}.o {CXXFLAGS} -fPIC -shared', 409 | ] 410 | if cpp in dyn_shared: 411 | DYNLIBS += ' '+lib 412 | else: 413 | LIBS += ' '+lib 414 | dlibs = '' 415 | if LIBS: 416 | dlibs = f'-DLIBS=\\"{DYNLIBS.strip()}\\"' 417 | cmds += [ 418 | f'{CXX} -c tests/{main} -o {BUILDDIR}/{test}.{mode}.o {CXXFLAGS} -I. {dlibs}', 419 | f'{CXX} -o {binary} {BUILDDIR}/{test}.{mode}.o {CXXFLAGS}{LIBS}', 420 | ] 421 | cmdlists.append(cmds) 422 | binaries.setdefault(test,list()).append(binary) 423 | return cmdlists, binaries 424 | 425 | def run_cxx_test(test, binaries): 426 | cmdlists = [] 427 | for binary in binaries: 428 | name = os.path.basename(binary) 429 | env = '' 430 | if 'xray' in binary: 431 | env = 'env XRAY_OPTIONS="patch_premain=true"' 432 | cmds = [ 433 | f'mkdir -p {OUTDIR}/{name}', 434 | f'cd {OUTDIR}/{name}; {env} ../../{binary}', 435 | ] 436 | if 'count' in test: 437 | cmds += [ 438 | f'./target/{TARGET}/release/funcount2sym {OUTDIR}/{name}/funcount.txt | c++filt > {OUTDIR}/{name}/symcount.txt' 439 | ] 440 | else: 441 | cmds += [ 442 | f'./target/{TARGET}/release/funtrace2viz {OUTDIR}/{name}/funtrace.raw {OUTDIR}/{name}/funtrace > {OUTDIR}/{name}/f2v.out' 443 | ] 444 | cmdlists.append(cmds) 445 | return cmdlists 446 | 447 | 448 | def main(): 449 | global pool 450 | pool = Pool() 451 | build_trace_analysis_tools() 452 | system(f'rm -rf {BUILDDIR}') 453 | system(f'rm 
-rf {OUTDIR}') 454 | system(f'mkdir -p {BUILDDIR}') 455 | 456 | cmdlists = [] 457 | test2bins = {} 458 | def buildcmds(*args,**kw): 459 | c,b = build_cxx_test(*args,**kw) 460 | cmdlists.extend(c) 461 | test2bins.update(b) 462 | 463 | buildcmds('ignore_disable.cpp') 464 | buildcmds('exceptions.cpp') 465 | buildcmds('untraced_catcher.cpp') 466 | buildcmds('untraced_funcs.cpp') 467 | buildcmds('longjmp.cpp') 468 | buildcmds('tailcall.cpp') 469 | buildcmds('orphans.cpp') 470 | buildcmds('buf_size.cpp') 471 | buildcmds('benchmark.cpp',flags=f'-funtrace-no-trace={os.path.realpath("tests/no-trace-bench.txt")}') 472 | buildcmds('freq.cpp') 473 | buildcmds('killed.cpp') 474 | buildcmds('sigtrap.cpp') 475 | buildcmds('ftrace.cpp') 476 | buildcmds('asm_filter.cpp',flags=f'-funtrace-instr-thresh=20 -funtrace-no-trace={os.path.realpath("tests/no-trace.txt")} -funtrace-do-trace={os.path.realpath("tests/do-trace.txt")}') 477 | buildcmds('asm_filter_2.cpp',flags=f'-funtrace-instr-thresh=20 -funtrace-ignore-loops') 478 | buildcmds('shared.cpp',shared=['lib_shared.cpp'],dyn_shared=['lib_dyn_shared.cpp']) 479 | buildcmds('count.cpp',shared=['count_shared.cpp'],dyn_shared=['count_dyn_shared.cpp'],flags='-DFUNTRACE_FUNCOUNT -DFUNCOUNT_PAGE_TABLES=2') 480 | buildcmds('c.c') 481 | pool.map(run_cmds, cmdlists) 482 | 483 | cmdlists = [] 484 | killedcmds = [] 485 | for test,binaries in test2bins.items(): 486 | cmds = killedcmds if 'killed' in test else cmdlists # we run killed later 487 | cmds.extend(run_cxx_test(test,binaries)) 488 | 489 | pool.map(run_cmds, cmdlists) 490 | check() 491 | 492 | pool.map(run_cmds, killedcmds) 493 | for binary in test2bins['killed']: 494 | if 'xray' in binary or 'clang' in binary: 495 | continue # my gdb is too old to parse the latest LLVM's DWARF; there's no better reason for this condition... 
496 | check_funtrace_from_core_dump(binary) 497 | check_orphan_tracer_removal() 498 | 499 | jsonmod = json 500 | def check(): 501 | print('checking results...') 502 | 503 | def load_threads(json): 504 | return parse_perfetto_json(json)['threads'] 505 | def load_thread(json): 506 | return list(load_threads(json).values())[0] 507 | def load_ftrace(json): 508 | return jsonmod.load(open(json))['systemTraceEvents'] 509 | 510 | def jsons(test): return sorted(glob.glob(f'{OUTDIR}/{test}.*/funtrace.json')) 511 | 512 | # funtrace tests [except freq] 513 | for json in jsons('ignore_disable'): 514 | print('checking',json) 515 | threads = load_threads(json) 516 | assert len(threads) == 3 517 | for name,thread in threads.items(): 518 | if name in ['child1','child3']: 519 | assert verify_thread(thread, ignore_disable_child_ref) 520 | elif name == 'main': 521 | assert verify_thread(thread, ignore_disable_main_ref) 522 | else: 523 | assert False, f'unexpected thread name: {name}' 524 | for json in jsons('exceptions'): 525 | print('checking',json) 526 | assert verify_thread(load_thread(json), exceptions_ref) 527 | for json in jsons('untraced_catcher'): 528 | print('checking',json) 529 | ref = clean_untraced_caller_ref if 'fi-gcc' in json else (dirty_untraced_caller_ref if 'xray' not in json else dirty_untraced_catcher_xray_ref) 530 | assert verify_thread(load_thread(json), ref) 531 | for json in jsons('untraced_funcs'): 532 | print('checking',json) 533 | assert verify_thread(load_thread(json), untraced_funcs_ref) 534 | for json in jsons('longjmp'): 535 | print('checking',json) 536 | assert verify_thread(load_thread(json), longjmp_ref) 537 | for json in jsons('tailcall'): 538 | print('checking',json) 539 | assert verify_thread(load_thread(json), tailcall_clean_ref if 'fi-' in json else tailcall_dirty_ref) 540 | for json in jsons('orphans'): 541 | print('checking',json) 542 | assert verify_thread(load_thread(json), orphans_ref(json)) 543 | for json in jsons('buf_size'): 544 | 
print('checking',json) 545 | threads = load_threads(json) 546 | assert verify_thread(threads['event_buf_1'], buf_size_ref) 547 | num_f_calls = len([name for _,name,_ in threads['event_buf_16'] if name.startswith('f()')]) 548 | assert num_f_calls <= 16*2 and num_f_calls >= 14*2, f'wrong number of f calls: {num_f_calls}' 549 | for json in jsons('sigtrap'): 550 | print('checking',json) 551 | thread = load_thread(json) 552 | assert len([name for _,name,_ in thread if name.startswith('traced_func')]) >= 100 553 | for json in jsons('shared'): 554 | print('checking',json) 555 | for thread in load_threads(json).values(): 556 | assert verify_thread(thread, shared_ref) 557 | for json in jsons('asm_filter'): 558 | print('checking',json) 559 | if 'xray' not in json: # we don't support asm filtering for XRay 560 | assert verify_thread(load_thread(json), asm_filter_ref) 561 | for json in jsons('ftrace'): 562 | print('checking',json) 563 | ftrace = load_ftrace(json) 564 | threads = load_threads(json) 565 | check_ftrace(ftrace, threads) 566 | for json in jsons('c'): 567 | print('checking',json) 568 | assert verify_thread(load_thread(json), c_ref) 569 | 570 | # funcount test 571 | for symcount_txt in sorted(glob.glob(f'{OUTDIR}/count.*/symcount.txt')): 572 | print('checking',symcount_txt) 573 | check_count_results(symcount_txt) 574 | 575 | # check last... 
might fail intermittently because we sleep for more than we asked for 576 | # due to the machine being loaded or whatever 577 | for json in jsons('freq'): 578 | print('checking',json) 579 | t = load_thread(json) 580 | assert verify_thread(t, freq_ref) 581 | slept = t[1][-1]-t[0][-1] 582 | assert slept >= 1500 and slept < 1700, f'wrong sleeping time {slept}' 583 | 584 | def check_funtrace_from_core_dump(test): 585 | testdir = f'{OUTDIR}/{os.path.basename(test)}' 586 | # the test produces an empty trace with no samples to extract to funtrace.json 587 | tracejson = f'{testdir}/funtrace.json' 588 | assert not os.path.exists(tracejson) 589 | assert os.path.exists(f'{testdir}/core'), f'{testdir}/core not found - is your /proc/sys/kernel/core_pattern set to "core", and is core dump size unlimited in the shell?' 590 | 591 | system(f'cd {testdir} && gdb -q ../../{test} core -x ../../funtrace_gdb.py -ex funtrace -ex quit') 592 | system(f'./target/{TARGET}/release/funtrace2viz {testdir}/funtrace.raw {testdir}/funtrace') 593 | 594 | # core dump analysis should produce a sample that will be extraced to funtrace.json 595 | assert os.path.exists(tracejson) 596 | 597 | data = parse_perfetto_json(tracejson) 598 | threads = data['threads'] 599 | ftrace = data['systemTraceEvents'] 600 | assert 'sched_waking: comm=child', f'bad ftrace data:\n{ftrace}' 601 | 602 | # check that both the active and the recently finished thread were found 603 | assert len(threads) == 3 604 | assert 'child' in threads 605 | for thread in threads.values(): 606 | is_main = len([name for _,name,_ in thread if name.startswith('main')]) > 0 607 | if is_main: 608 | # we're checking, in particular, that after saving a snapshot we don't have "noise" trace entries from funtrace itself 609 | thread = [(what,name,when) for what,name,when in thread if '_GLOBAL__' not in name and '__static_initialization_and_destruction' not in name] 610 | ref = killed_main_ref if is_main else killed_children_ref 611 | assert 
verify_thread(thread, ref) 612 | 613 | def check_orphan_tracer_removal(): 614 | def funtrace_pid(s): 615 | try: 616 | t = s.split('.') 617 | assert len(t) == 2 and t[0]=='funtrace' 618 | return int(t[1]) 619 | except: 620 | return 0 621 | def find_tracers(): 622 | return [f for f in glob.glob('/sys/kernel/tracing/instances/funtrace.*') if funtrace_pid(os.path.basename(f))] 623 | tracers = find_tracers() 624 | assert len(tracers) >= 4, f'expected at least 4 funtrace ftrace instances, found {len(tracers)}: {tracers}' 625 | print('\n'.join(['orphan tracer instances:']+tracers)) 626 | 627 | # could be any funtrace-instrumented program - they clean orphan tracer dirs upon exit 628 | system(f'cd out/benchmark.pg; ../../{BUILDDIR}/benchmark.pg') 629 | for t in tracers: 630 | pid = funtrace_pid(os.path.basename(t)) 631 | # either the PID exists or the tracer was removed by the run of benchmark.pg 632 | assert os.path.exists('/proc/%d'%pid) or not os.path.exists(t) 633 | 634 | tracers = find_tracers() 635 | print('\n'.join(['orphan tracer instances:']+tracers)) 636 | 637 | if __name__ == '__main__': 638 | main() 639 | -------------------------------------------------------------------------------- /funtrace2viz/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{self, Read, Seek, SeekFrom}; 3 | use std::io::prelude::*; 4 | use std::mem; 5 | use bytemuck::{Pod, Zeroable}; 6 | use std::collections::{HashMap, HashSet}; 7 | use procaddr2sym::{ProcAddr2Sym, SymInfo}; 8 | use serde_json::Value; 9 | use clap::Parser; 10 | use std::cmp::{min, max}; 11 | use num::{FromPrimitive, Zero}; 12 | use num::rational::Ratio; 13 | use num::bigint::BigInt; 14 | 15 | const RETURN_BIT: i32 = 63; 16 | const RETURN_WITH_CALLER_ADDRESS_BIT: i32 = 62; 17 | const CATCH_MASK: u64 = (1< bool { ((n>>b)&1) != 0 } 24 | 25 | // Struct to represent a 16-byte FUNTRACE entry 26 | #[repr(C)] 27 | #[derive(Debug, Pod, Zeroable, 
Clone, Copy)] 28 | struct FunTraceEntry { 29 | address: u64, 30 | cycle: u64, 31 | } 32 | 33 | struct SourceCode { 34 | json_str: String, 35 | num_lines: usize, 36 | } 37 | 38 | #[derive(Parser)] 39 | #[clap(about="convert funtrace.raw to JSON files in the viztracer/vizviewer format (pip install viztracer; or use Perfetto but then you won't see source code)", version)] 40 | struct Cli { 41 | #[clap(help="funtrace.raw input file with one or more trace samples")] 42 | funtrace_raw: String, 43 | #[clap(help="basename.json, basename.1.json, basename.2.json... are created, one JSON file per trace sample")] 44 | out_basename: String, 45 | #[clap(short, long, help="print the static addresses and executable/shared object files of decoded functions in addition to name, file & line")] 46 | executable_file_info: bool, 47 | #[clap(short, long, help="print the raw timestamps (the default is to subtract the timestamp of the earliest reported event at each sample, so that time starts at 0; in particular it helps to avoid rounding issues you might see with large timestamp values)")] 48 | raw_timestamps: bool, 49 | #[clap(short, long, help="ignore events older than this relatively to the latest recorded event in a given trace sample (very old events create the appearance of a giant blank timeline in vizviewer/Perfetto which zooms out to show the recorded timeline in full)")] 50 | max_event_age: Option, 51 | #[clap(short, long, help="ignore events older than this cycle (like --max-event-age but as a timestamp instead of an age in cycles)")] 52 | oldest_event_time: Option, 53 | #[clap(short, long, help="dry run - only list the samples & threads with basic stats, don't decode into JSON")] 54 | dry: bool, 55 | #[clap(short, long, help="ignore samples with indexes outside this list")] 56 | samples: Vec, 57 | #[clap(short, long, help="ignore threads with TIDs outside this list (including for the purpose of interpreting --max-event-age)")] 58 | threads: Vec, 59 | } 60 | 61 | struct 
TraceConverter { 62 | procaddr2sym: ProcAddr2Sym, 63 | // we dump source code into the JSON files to make it visible in vizviewer 64 | source_cache: HashMap, 65 | sym_cache: HashMap, 66 | max_event_age: Option, 67 | raw_timestamps: bool, 68 | time_base: u64, 69 | oldest_event_time: Option, 70 | dry: bool, 71 | samples: Vec, 72 | threads: Vec, 73 | cpu_freq: u64, 74 | cmd_line: String, 75 | first_event_in_json: bool, 76 | first_event_in_thread: bool, 77 | num_events: i64, 78 | } 79 | 80 | #[repr(C)] 81 | #[derive(Debug, Pod, Zeroable, Clone, Copy)] 82 | struct ThreadID 83 | { 84 | pid: u64, 85 | tid: u64, 86 | name: [u8; 16], 87 | } 88 | 89 | struct ThreadTrace { 90 | thread_id: ThreadID, 91 | trace: Vec, 92 | } 93 | 94 | struct FtraceEvent { 95 | timestamp: u64, 96 | line: String, 97 | } 98 | 99 | fn parse_ftrace_lines(input: &String, transform_timestamp: impl Fn(u64) -> String) -> Vec { 100 | let mut results = Vec::new(); 101 | 102 | for line in input.lines() { 103 | // Find the timestamp section 104 | if let Some(colon_pos) = line.find(": ") { 105 | // Search backwards from colon to find the start of timestamp 106 | if let Some(space_before_ts) = line[..colon_pos].rfind(char::is_whitespace) { 107 | let timestamp_str = &line[space_before_ts + 1..colon_pos]; 108 | 109 | // Parse the timestamp 110 | if let Ok(timestamp) = timestamp_str.parse::() { 111 | // Split line into parts 112 | let before_ts = &line[..space_before_ts + 1]; 113 | let after_ts = &line[colon_pos..]; 114 | 115 | // Create modified line with transformed timestamp 116 | let modified_line = format!( 117 | "{}{}{}", 118 | before_ts, 119 | transform_timestamp(timestamp), 120 | after_ts 121 | ); 122 | 123 | results.push(FtraceEvent { 124 | timestamp, 125 | line: modified_line, 126 | }); 127 | } 128 | } 129 | } 130 | } 131 | 132 | results 133 | } 134 | 135 | fn rat2dec(rat: &Ratio, decimal_places: u32) -> String { 136 | let mut result = "".to_string(); 137 | let mut rational = rat.clone(); 138 | if rat < 
&Ratio::from_u64(0).unwrap() { //shouldn't happen in this program but let's print correctly if it does 139 | rational = -rat; 140 | result = "-".to_string(); 141 | } 142 | // Round - add 0.0..05 143 | let rounded = rational + Ratio::from_u64(5).unwrap() / Ratio::from_u64(10u64.pow(decimal_places+1)).unwrap(); 144 | 145 | // Get numerator and denominator 146 | let numerator = rounded.numer(); 147 | let denominator = rounded.denom(); 148 | 149 | // Perform division with extra precision to ensure accuracy 150 | let mut quotient = numerator / denominator; 151 | let mut remainder = numerator % denominator; 152 | 153 | // Build the decimal string 154 | result = result + &quotient.to_string(); 155 | 156 | if !remainder.is_zero() { 157 | result.push('.'); 158 | 159 | // Calculate decimal digits 160 | for _ in 0..decimal_places { 161 | remainder *= 10; 162 | quotient = &remainder / denominator; 163 | remainder = &remainder % denominator; 164 | result.push_str(&quotient.to_string()); 165 | 166 | if remainder.is_zero() { 167 | break; 168 | } 169 | } 170 | } 171 | 172 | result 173 | } 174 | 175 | impl TraceConverter { 176 | pub fn new(args: &Cli) -> Self { 177 | TraceConverter { procaddr2sym: ProcAddr2Sym::new(), source_cache: HashMap::new(), sym_cache: HashMap::new(), 178 | max_event_age: args.max_event_age, raw_timestamps: args.raw_timestamps, time_base: 0, 179 | oldest_event_time: args.oldest_event_time, dry: args.dry, 180 | samples: args.samples.clone(), threads: args.threads.clone(), cpu_freq: 0, cmd_line: "".to_string(), 181 | first_event_in_json: false, first_event_in_thread: false, num_events: 0 182 | } 183 | } 184 | 185 | fn oldest_event(&self, sample_entries: &Vec<ThreadTrace>, ftrace_events: &Vec<FtraceEvent>) -> u64 { 186 | let mut youngest = 0; 187 | let mut oldest = u64::MAX; 188 | for entries in sample_entries { 189 | if self.threads.is_empty() || self.threads.contains(&entries.thread_id.tid) { 190 | oldest = min(entries.trace.first().unwrap().cycle, oldest); 191 | youngest = 
max(entries.trace.last().unwrap().cycle, youngest);
            }
        }
        if !ftrace_events.is_empty() {
            oldest = min(ftrace_events.first().unwrap().timestamp, oldest);
            youngest = max(ftrace_events.last().unwrap().timestamp, youngest);
        }
        if let Some(max_age) = self.max_event_age {
            //saturating_sub: if --max-event-age exceeds the youngest timestamp, plain
            //subtraction would underflow u64 (panic in debug builds, wrap in release);
            //saturating to 0 means "report everything", which is the sane interpretation
            youngest.saturating_sub(max_age)
        }
        else if let Some(oldest_to_report) = self.oldest_event_time {
            oldest_to_report
        }
        else {
            oldest
        }
    }

    //writes one Chrome-trace "complete" (ph:X) event for a call/return pair, emitting the
    //thread/process metadata events the first time a thread is seen, and caching the source
    //file of the called function for later dumping into the JSON.
    //extra_ns shifts the return timestamp if positive or the call timestamp if negative
    //NOTE(review): HashSet's generic parameter was stripped by extraction; SymInfo restored
    //from funcset.insert(call_sym.clone()) below - confirm against original.
    fn write_function_call_event(&mut self, json: &mut File, call_sym: &SymInfo, call_cycle: u64, return_cycle: u64, extra_ns: i32, thread_id: &ThreadID, funcset: &mut HashSet<SymInfo>) -> io::Result<()> {
        self.num_events += 1;
        if self.dry {
            return Ok(());
        }
        if self.first_event_in_thread {
            //strip the NUL padding from the fixed-size (16-byte) thread name buffer
            let name: Vec<_> = thread_id.name.iter().filter(|&&x| x != 0 as u8).copied().collect();
            json.write(format!(r#"{}{{"ph":"M","pid":{},"tid":{},"name":"thread_name","args":{{"name":{}}}}}"#,
                if self.first_event_in_json { "" } else { "\n," },
                thread_id.pid,thread_id.tid,Value::String(String::from_utf8(name).unwrap()).to_string()).as_bytes())?;
            self.first_event_in_thread = false;
            self.first_event_in_json = false;

            if thread_id.pid == thread_id.tid {
                json.write(format!(r#"{}{{"ph":"M","pid":{},"tid":{},"name":"process_name","args":{{"name":{}}}}}"#, "\n,",
                    thread_id.pid,thread_id.tid,Value::String(self.cmd_line.clone()).to_string()).as_bytes())?;
            }
        }
        //using f64 would lose precision for machines with an uptime > month since f64 stores
        //52 mantissa bits and TSC increments a couple billion times per second.
230 | //we use rational numbers instead 231 | let rat = |n: u64| Ratio::from_u64(n).unwrap(); 232 | let cycles_per_us = rat(self.cpu_freq) / rat(1000000); 233 | 234 | let (extra_ret, extra_call) = if extra_ns > 0 { 235 | (rat(extra_ns as u64) / rat(1000), rat(0)) 236 | } 237 | else { 238 | (rat(0), rat(-extra_ns as u64) / rat(1000)) 239 | }; 240 | 241 | let digits = 4; //Perfetto timeline has nanosecond precision - no point in printing 242 | //more digits than 3 for the microsecond timestamps it expects in the JSON; we print 4 243 | //for testing to make sure that cycles don't round to the same ns that should be distinct events 244 | 245 | if return_cycle != 0 && call_cycle != 0 { // a "complete" event (ph:X); these needn't be sorted by timestamp 246 | //note that we could have used the B and E events for "incomplete" function calls missing a call 247 | //or a return timestamp. however, the last orphan B event seems to be missing from Perfetto's rendering 248 | //and all of the orphan E events seem to be missing; B and E are apparently mostly designed to come in pairs 249 | //(despite the beautiful gradient that orphan B events are rendered with) 250 | json.write(format!(r#"{}{{"tid":{},"ts":{},"dur":{},"name":{},"ph":"X","pid":{}}}"#, "\n,", 251 | thread_id.tid, 252 | rat2dec(&(rat(call_cycle-self.time_base)/cycles_per_us.clone() - extra_call.clone()), digits), 253 | rat2dec(&(rat(return_cycle-call_cycle)/cycles_per_us + extra_call + extra_ret), digits), 254 | json_name(call_sym), thread_id.pid).as_bytes())?; 255 | } 256 | 257 | funcset.insert(call_sym.clone()); 258 | 259 | //cache the source code if it's the first time we see this file 260 | if !self.source_cache.contains_key(&call_sym.file) { 261 | let mut source_code: Vec = Vec::new(); 262 | if let Ok(mut source_file) = File::open(&call_sym.file) { 263 | source_file.read_to_end(&mut source_code)?; 264 | } 265 | else if call_sym.file != "??" 
{ 266 | println!("WARNING: couldn't open source file {} - you can remap paths using a substitute-path.json file in your working directory", call_sym.file); 267 | } 268 | let json_str = Value::String(String::from_utf8(source_code.clone()).unwrap()).to_string(); 269 | let num_lines = source_code.iter().filter(|&&b| b == b'\n').count(); //TODO: num newlines 270 | //might be off by one relatively to num lines... 271 | self.source_cache.insert(call_sym.file.clone(), SourceCode{ json_str, num_lines }); 272 | } 273 | Ok(()) 274 | } 275 | 276 | fn write_sample_to_json(&mut self, fname: &String, sample_entries: &Vec, ftrace_text: &String) -> io::Result<()> { 277 | let mut json = if self.dry { File::open("/dev/null")? } else { File::create(fname)? }; 278 | if !self.dry { 279 | json.write(br#"{ 280 | "traceEvents": [ 281 | "#)?; 282 | println!("decoding a trace sample logged by `{}` into {} ...", self.cmd_line, fname); 283 | } 284 | else { 285 | println!("inspecting sample {} logged by `{}` (without creating the file...)", fname, self.cmd_line); 286 | } 287 | 288 | // we list the set of functions (to tell their file, line pair to vizviewer); 289 | // we also use this set to only dump the relevant part of the source cache to each 290 | // json (the source cache persists across samples/jsons but not all files are relevant 291 | // to all samples) 292 | let mut funcset: HashSet = HashSet::new(); 293 | self.first_event_in_json = true; 294 | let mut ignore_addrs: HashSet = HashSet::new(); 295 | 296 | let rat = |n: u64| Ratio::from_u64(n).unwrap(); 297 | //ftrace timestamps are supposed to be in seconds; CPU frequency is in TSC cycles per second; 298 | //so dividing by frequency will convert TSC to seconds. Perfetto timeline accuracy is ns 299 | //hence 10 digits after '.' 
(9 plus another to make sure different cycles don't become the same ns) 300 | let cycles_per_second = rat(self.cpu_freq); 301 | let fixts = |ts: u64| format!("{}", rat2dec(&(rat(ts)/cycles_per_second.clone()), 10)); 302 | let mut ftrace_events = parse_ftrace_lines(ftrace_text, fixts); 303 | 304 | let oldest = self.oldest_event(sample_entries, &ftrace_events); 305 | self.time_base = if self.raw_timestamps { 0 } else { oldest }; 306 | 307 | if self.time_base > 0 { 308 | //TODO: a bit wasteful to reparse this just to subtract the time base 309 | let fixts = |ts: u64| format!("{}", rat2dec(&(rat(ts-self.time_base)/cycles_per_second.clone()), 10)); 310 | ftrace_events = parse_ftrace_lines(ftrace_text, fixts); 311 | } 312 | 313 | ftrace_events.retain(|event| event.timestamp >= oldest); 314 | 315 | for thread_trace in sample_entries { 316 | let entries = &thread_trace.trace; 317 | if !self.threads.is_empty() && !self.threads.contains(&thread_trace.thread_id.tid) { 318 | println!("ignoring thread {} - not on the list {:?}", thread_trace.thread_id.tid, self.threads); 319 | continue; 320 | } 321 | let mut stack: Vec = Vec::new(); 322 | self.num_events = 0; 323 | let earliest_cycle = max(entries[0].cycle, oldest); 324 | let latest_cycle = entries[entries.len()-1].cycle; 325 | let mut num_orphan_returns = 0; 326 | self.first_event_in_thread = true; 327 | 328 | let mut expecting_to_return_into_sym = self.procaddr2sym.unknown_symbol(); 329 | 330 | for entry in entries { 331 | if oldest > entry.cycle { 332 | continue; //ignore old events 333 | } 334 | let catch = (entry.address & CATCH_MASK) == CATCH_MASK; 335 | let ret_with_caller_addr = bit_set(entry.address, RETURN_WITH_CALLER_ADDRESS_BIT) && !catch; 336 | let ret = (bit_set(entry.address, RETURN_BIT) || ret_with_caller_addr) && !catch; 337 | let addr = entry.address & ADDRESS_MASK; 338 | 339 | if !self.sym_cache.contains_key(&addr) { 340 | let sym = self.procaddr2sym.proc_addr2sym(addr); 341 | //we ignore "virtual override 
thunks" because they aren't interesting 342 | //to the user, and what's more, some of them call __return__ but not 343 | //__fentry__ under -pg, so you get spurious "orphan returns" (see below 344 | //how we handle supposedly "real" orphan returns.) 345 | if sym.demangled_func.contains("virtual override thunk") { 346 | ignore_addrs.insert(addr); 347 | } 348 | self.sym_cache.insert(addr, sym); 349 | } 350 | if ignore_addrs.contains(&addr) { 351 | continue; 352 | } 353 | //println!("{} {} sym {}", stack.len(), if catch { "catch" } else if ret { "ret" } else { "call" }, json_name(self.sym_cache.get(&addr).unwrap())); 354 | if catch { 355 | //pop the entries on the stack until we find the function which logged the catch entry. 356 | //if we don't find it, perhaps its call entry didn't make it into our trace, or, more 357 | //troublingly, it was compiled without instrumentation or something else went wrong which 358 | //will cause us to pop everything from the stack. but resetting the stack upon a catch 359 | //is probably less bad than leaving it as is since then it would keep growing with 360 | //every catch 361 | // 362 | //TODO: we could probably improve the handling of "uninstrumented catchers" by keeping 363 | //a history of the fully-popped stacks and then when a return arrives of a function 364 | //in one of these stacks that was "orphaned" by the throw/catch, we could find its call 365 | //entry in this history and reconstruct the call sequence. this could be done given demand; 366 | //ATM we just advise against compiling "catchers" without instrumentation. 
[note that 367 | //the improvement above would work some of the time but not always, eg because a return 368 | //of any of the catcher's caller wasn't traced, either because it didn't happen or 369 | //because the callers of the catcher were also uninstrumented - and this isn't a far-fetched 370 | //scenario, eg if you have some loop with the top-level code catching exceptions, 371 | //it might be running "indefinitely" so you won't see a return that would trigger the 372 | //logic above. so advising against uninstrumented catchers 373 | //will remain valid even if we add all the logic described above.] 374 | let catcher = self.sym_cache.get(&addr).unwrap().demangled_func.clone(); 375 | let mut unwound = 0; 376 | while !stack.is_empty() { 377 | let last = stack.last().unwrap(); 378 | if bit_set(last.address, CALL_RETURNING_UPON_THROW_BIT) { 379 | //this was traced with -finstrument-functions or "something" that would have 380 | //recorded a return event had it been returned from due to stack unwinding 381 | break; 382 | } 383 | let call_sym = self.sym_cache.get(&(last.address & ADDRESS_MASK)).unwrap(); 384 | if catcher == call_sym.demangled_func { //we don't compare by address since it could be two 385 | //different symbols - we entered "f(int)" and we are catching inside "f(int) [clone .cold]"; 386 | //procaddr2sym strips the [clone...] 
from the name so we can compare by it 387 | break; 388 | } 389 | //these all end at the same cycle contrary to the JSON spec's perfect nesting requirement; 390 | //unlike XRay we try to make them stand apart by 1 ns (the timeline's precision), also makes testing more straightforward 391 | unwound += 1; 392 | self.write_function_call_event(&mut json, &call_sym.clone(), last.cycle, entry.cycle, unwound, &thread_trace.thread_id, &mut funcset)?; 393 | stack.pop(); 394 | } 395 | continue; 396 | } 397 | if !ret { 398 | stack.push(*entry); 399 | } 400 | else { 401 | let ret_sym = self.sym_cache.get(&addr).unwrap().clone(); 402 | 403 | if stack.is_empty() { //an "orphan return" - the call wasn't in the trace 404 | num_orphan_returns += 1; 405 | //if ret_with_caller_addr, record the return into the function we're expecting to return into (might be unknown 406 | //or we could know by getting a previous return event with the caller's address) 407 | let sym = if ret_with_caller_addr { &expecting_to_return_into_sym } else { &ret_sym }; 408 | self.write_function_call_event(&mut json, sym, earliest_cycle, entry.cycle, -num_orphan_returns, &thread_trace.thread_id, &mut funcset)?; 409 | if ret_with_caller_addr { 410 | expecting_to_return_into_sym = ret_sym.clone(); 411 | } 412 | continue; 413 | } 414 | if ret_with_caller_addr { 415 | //this might be useful if we get an orphan return next 416 | expecting_to_return_into_sym = ret_sym.clone(); 417 | } 418 | 419 | let call_entry = stack.pop().unwrap(); 420 | let mut call_cycle = call_entry.cycle; 421 | let mut call_sym = self.sym_cache.get(&(call_entry.address & ADDRESS_MASK)).unwrap().clone(); 422 | //warn if we return to a different function from the one predicted by the call stack. 
423 | //this "shouldn't happen" but it does unless we ignore "virtual override thunks" 424 | //and it's good to at least emit a warning when it does since the trace will look strange 425 | 426 | //warn if we're returning to a function different than predicted by the call stack, 427 | //and try to recover from the problem by popping from the stack until we find right function 428 | //(eg setjmp/longjmp can cause this problem). 429 | let mut returns = 0; 430 | if !ret_with_caller_addr { 431 | //comparing names instead of addresses because of the [clone ...] business - not sure if we can 432 | //call one clone and return into another but who knows, certainly catch returns to another clone at times 433 | if ret_sym.demangled_func != call_sym.demangled_func { 434 | println!(" WARNING: call/return mismatch - {} popped from the stack but {} returning", json_name(&call_sym), json_name(&ret_sym)); 435 | let mut found = false; 436 | while !found { 437 | self.write_function_call_event(&mut json, &call_sym.clone(), call_cycle, entry.cycle, returns, &thread_trace.thread_id, &mut funcset)?; 438 | if stack.is_empty() { 439 | break; 440 | } 441 | let last = stack.last().unwrap(); 442 | call_sym = self.sym_cache.get(&(last.address & ADDRESS_MASK)).unwrap().clone(); 443 | call_cycle = last.cycle; 444 | println!(" WARNING: popping {}", json_name(&call_sym)); 445 | stack.pop(); 446 | returns += 1; 447 | found = ret_sym.demangled_func == call_sym.demangled_func; 448 | } 449 | } 450 | } 451 | else if !stack.is_empty() { 452 | let ret_caller_sym = self.sym_cache.get(&(stack.last().unwrap().address & ADDRESS_MASK)).unwrap(); 453 | if ret_sym.demangled_func != ret_caller_sym.demangled_func && stack.iter().any(|&entry| self.sym_cache.get(&(entry.address & ADDRESS_MASK)).unwrap().demangled_func == ret_sym.demangled_func) { 454 | println!(" WARNING: call/return mismatch - {} called from {}, the returning function's caller is {}", json_name(&call_sym), json_name(ret_caller_sym), 
json_name(&ret_sym)); 455 | let mut found = false; 456 | while !found { 457 | self.write_function_call_event(&mut json, &call_sym.clone(), call_cycle, entry.cycle, returns, &thread_trace.thread_id, &mut funcset)?; 458 | if stack.is_empty() { 459 | break; 460 | } 461 | let last = stack.last().unwrap(); 462 | call_sym = self.sym_cache.get(&(last.address & ADDRESS_MASK)).unwrap().clone(); 463 | call_cycle = last.cycle; 464 | println!(" WARNING: popping {}", json_name(&call_sym)); 465 | stack.pop(); 466 | returns += 1; 467 | found = !stack.is_empty() && ret_sym.demangled_func == self.sym_cache.get(&(stack.last().unwrap().address & ADDRESS_MASK)).unwrap().demangled_func; 468 | } 469 | } 470 | } 471 | self.write_function_call_event(&mut json, &call_sym, call_cycle, entry.cycle, returns, &thread_trace.thread_id, &mut funcset)?; 472 | } 473 | } 474 | //if the stack isn't empty, record a call with a fake return cycle 475 | let mut fake_returns = stack.len() as i32; 476 | for entry in &stack { 477 | let call_sym = self.sym_cache.get(&(entry.address & ADDRESS_MASK)).unwrap(); 478 | self.write_function_call_event(&mut json, &call_sym.clone(), entry.cycle, latest_cycle, fake_returns, &thread_trace.thread_id, &mut funcset)?; 479 | fake_returns -= 1; 480 | } 481 | let name = String::from_utf8(thread_trace.thread_id.name.iter().filter(|&&x| x != 0 as u8).copied().collect()).unwrap(); 482 | if latest_cycle >= earliest_cycle { 483 | println!(" thread {} {} - {} recent function calls logged over {} cycles [{} - {}]", thread_trace.thread_id.tid, name, self.num_events, latest_cycle-earliest_cycle, earliest_cycle-self.time_base, latest_cycle-self.time_base); 484 | } 485 | else { 486 | println!(" skipping thread {} {} (all {} logged function entry/return events are too old)", thread_trace.thread_id.tid, name, entries.len()); 487 | } 488 | } 489 | if self.dry { 490 | return Ok(()) 491 | } 492 | 493 | json.write(b"],\n")?; 494 | 495 | if !ftrace_events.is_empty() { 496 | let joined: String 
= ftrace_events.iter().map(|e| e.line.clone() + "\n").collect(); 497 | 498 | json.write(br#""systemTraceEvents": "#)?; 499 | //# tracer: nop is something Perfetto doesn't seem to need but the Chromium trace 500 | //JSON spec insists is a must 501 | json.write(Value::String("# tracer: nop\n".to_string() + &joined).to_string().as_bytes())?; 502 | json.write(b",\n")?; 503 | 504 | let oldest_ftrace = ftrace_events[0].timestamp; 505 | let newest_ftrace = ftrace_events[ftrace_events.len()-1].timestamp; 506 | println!(" ftrace - {} events logged over {} cycles [{} - {}]", ftrace_events.len(), newest_ftrace-oldest_ftrace, oldest_ftrace-self.time_base, newest_ftrace-self.time_base); 507 | } 508 | 509 | // find the source files containing the functions in this sample's set 510 | let mut fileset: HashSet = HashSet::new(); 511 | for sym in funcset.iter() { 512 | fileset.insert(sym.file.clone()); 513 | } 514 | json.write(br#""viztracer_metadata": { 515 | "version": "0.16.3", 516 | "overflow": false, 517 | "producer": "funtrace2viz" 518 | }, 519 | "file_info": { 520 | "files": { 521 | "#)?; 522 | 523 | // dump the source code of these files into the json 524 | for (i, file) in fileset.iter().enumerate() { 525 | if let Some(&ref source_code) = self.source_cache.get(file) { 526 | json.write(Value::String(file.clone()).to_string().as_bytes())?; 527 | json.write(b":[")?; 528 | json.write(source_code.json_str.as_bytes())?; 529 | json.write(b",")?; 530 | json.write(format!("{}", source_code.num_lines).as_bytes())?; 531 | json.write(if i==fileset.len()-1 { b"]\n" } else { b"],\n" })?; 532 | } 533 | } 534 | json.write(br#"}, 535 | "functions": { 536 | "#)?; 537 | 538 | // tell where each function is defined 539 | for (i, sym) in funcset.iter().enumerate() { 540 | // line-3 is there to show the function prototype in vizviewer/Perfetto 541 | // (often the debug info puts the line at the opening { of a function 542 | // and then the prototype is not seen, it can also span a few lines) 543 
| json.write(format!("{}:[{},{}]{}\n", json_name(sym), Value::String(sym.file.clone()).to_string(), if sym.line <= 3 { sym.line } else { sym.line-3 }, if i==funcset.len()-1 { "" } else { "," }).as_bytes())?; 544 | } 545 | json.write(b"}}}\n")?; 546 | 547 | Ok(()) 548 | } 549 | 550 | pub fn parse_chunks(&mut self, file_path: &String, json_basename: &String) -> io::Result<()> { 551 | let mut file = File::open(file_path)?; 552 | 553 | let mut sample_entries: Vec = Vec::new(); 554 | let mut num_json = 0; 555 | 556 | let mut thread_id = ThreadID { pid: 0, tid: 0, name: [0; 16] }; 557 | 558 | let mut ftrace_text = "".to_string(); 559 | 560 | self.procaddr2sym.input_source = Some(procaddr2sym::input_source(file_path.clone())); 561 | 562 | loop { 563 | //the file consists of chunks with an 8-byte magic string telling the chunk 564 | //type, followed by an 8-byte length field and then contents of that length 565 | let mut magic = [0u8; MAGIC_LEN]; 566 | if file.read_exact(&mut magic).is_err() { 567 | break; // End of file 568 | } 569 | 570 | let mut length_bytes = [0u8; LENGTH_LEN]; 571 | file.read_exact(&mut length_bytes)?; 572 | let chunk_length = usize::from_ne_bytes(length_bytes); 573 | 574 | if &magic == b"FUNTRACE" { 575 | if chunk_length != 8 { 576 | println!("warning: unexpected length {} for FUNTRACE chunk", chunk_length); 577 | file.seek(SeekFrom::Current(chunk_length as i64))?; 578 | continue; 579 | } 580 | let mut freq_bytes = [0u8; 8]; 581 | file.read_exact(&mut freq_bytes)?; 582 | self.cpu_freq = u64::from_ne_bytes(freq_bytes); 583 | } 584 | else if &magic == b"CMD LINE" { 585 | let mut cmd_bytes = vec![0u8; chunk_length]; 586 | file.read_exact(&mut cmd_bytes)?; 587 | self.cmd_line = String::from_utf8(cmd_bytes).unwrap(); 588 | } 589 | else if &magic == b"ENDTRACE" { 590 | if chunk_length != 0 { 591 | println!("warning: non-zero length for ENDTRACE chunk"); 592 | file.seek(SeekFrom::Current(chunk_length as i64))?; 593 | continue; 594 | } 595 | if 
!sample_entries.is_empty() || !ftrace_text.is_empty() { 596 | if self.samples.is_empty() || self.samples.contains(&num_json) { 597 | self.write_sample_to_json(&format_json_filename(json_basename, num_json), &sample_entries, &ftrace_text)?; 598 | } 599 | else { 600 | println!("ignoring sample {} - not on the list {:?}", num_json, self.samples); 601 | } 602 | num_json += 1; 603 | sample_entries.clear(); 604 | ftrace_text.clear(); 605 | } 606 | } 607 | else if &magic == b"PROCMAPS" { 608 | //the content of the dumping process's /proc/self/maps to use when 609 | //interpreting the next trace samples (until another PROCMAPS chunk is encountered) 610 | let mut chunk_content = vec![0u8; chunk_length]; 611 | file.read_exact(&mut chunk_content)?; 612 | self.procaddr2sym.set_proc_maps(chunk_content.as_slice()); 613 | //the symbol cache might have been invalidated if the process unloaded and reloaded a shared object 614 | self.sym_cache = HashMap::new(); 615 | } else if &magic == b"THREADID" { 616 | if chunk_length != std::mem::size_of::() { 617 | println!("Unexpected THREAD chunk length {} - expecting {}", chunk_length, std::mem::size_of::()); 618 | file.seek(SeekFrom::Current(chunk_length as i64))?; 619 | continue; 620 | } 621 | 622 | file.read_exact(bytemuck::bytes_of_mut(&mut thread_id))?; 623 | } else if &magic == b"TRACEBUF" { 624 | if chunk_length % mem::size_of::() != 0 { 625 | println!("Invalid TRACEBUF chunk length {} - must be a multiple of {}", chunk_length, mem::size_of::()); 626 | file.seek(SeekFrom::Current(chunk_length as i64))?; 627 | continue; 628 | } 629 | 630 | let num_entries = chunk_length / mem::size_of::(); 631 | let mut entries = ThreadTrace { thread_id, trace: vec![FunTraceEntry { address: 0, cycle: 0 }; num_entries] }; 632 | file.read_exact(bytemuck::cast_slice_mut(&mut entries.trace))?; 633 | entries.trace.retain(|&entry| !(entry.cycle == 0 && entry.address == 0)); 634 | if !entries.trace.is_empty() { 635 | entries.trace.sort_by_key(|entry| 
entry.cycle); 636 | sample_entries.push(entries); 637 | } 638 | } else if &magic == b"FTRACETX" { 639 | let mut ftrace_bytes = vec![0u8; chunk_length]; 640 | file.read_exact(&mut ftrace_bytes)?; 641 | ftrace_text = String::from_utf8(ftrace_bytes).unwrap(); 642 | } else { 643 | println!("Unknown chunk type: {:?}", std::str::from_utf8(&magic).unwrap_or("")); 644 | file.seek(SeekFrom::Current(chunk_length as i64))?; 645 | } 646 | } 647 | if !sample_entries.is_empty() || !ftrace_text.is_empty() { 648 | println!("warning: FUNTRACE block not closed by ENDTRACE"); 649 | self.write_sample_to_json(&format_json_filename(json_basename, num_json), &sample_entries, &ftrace_text)?; 650 | } 651 | 652 | Ok(()) 653 | } 654 | } 655 | 656 | fn format_json_filename(basename: &String, number: u32) -> String { 657 | if number > 0 { 658 | format!("{}.{}.json", basename, number) 659 | } else { 660 | format!("{}.json", basename) 661 | } 662 | } 663 | 664 | static mut PRINT_BIN_INFO: bool = false; 665 | 666 | fn json_name(sym: &SymInfo) -> String { 667 | //"unsafe" access to a config parameter... I guess I should have put stuff into a struct and have 668 | //most methods operate on it to make it prettier or something?.. 
669 | let print_bin_info = unsafe { PRINT_BIN_INFO }; 670 | if print_bin_info { 671 | Value::String(format!("{} ({}:{} {:#x}@{})", sym.demangled_func, sym.file, sym.line, sym.static_addr, sym.executable_file)).to_string() 672 | } 673 | else { 674 | Value::String(format!("{} ({}:{})", sym.demangled_func, sym.file, sym.line)).to_string() 675 | } 676 | } 677 | 678 | fn main() -> io::Result<()> { 679 | let args = Cli::parse(); 680 | if args.max_event_age.is_some() && args.oldest_event_time.is_some() { 681 | panic!("both --max-event-age and --oldest-event-time specified - choose one"); 682 | } 683 | unsafe { 684 | PRINT_BIN_INFO = args.executable_file_info; 685 | } 686 | let mut convert = TraceConverter::new(&args); 687 | convert.parse_chunks(&args.funtrace_raw, &args.out_basename) 688 | } 689 | 690 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # funtrace - a C/C++ function call tracer for x86/Linux 2 | 3 | A function call tracer is a kind of profiler showing **a timeline of function call and return events**. Here's an example trace captured by funtrace from [Krita](https://krita.org): 4 | 5 | ![image](images/krita-trace.png) 6 | 7 | Here we can see 2 threads - whether they're running or waiting, and the changes to their callstack over time - and the source code of a selected function. 8 | 9 | Unlike a sampling profiler such as perf, **a tracing profiler must be told what to trace** using some runtime API, and also has a **higher overhead** than the fairly low-frequency sampling of the current callstack a-la perf. What do you get in return for the hassle and the overhead (and the hassle of culling the overhead, by disabling tracing of short functions called very often)? 
Unlike flamegraphs showing where the program spends its time on average, traces let you **debug cases of unusually high latency**, including in production (and it's a great idea to collect traces in production, and not just during development!) 10 | 11 | If you're interested in why tracing profilers are useful and how funtrace works, see [Profiling in production with function call traces](https://yosefk.com/blog/profiling-in-production-with-function-call-traces.html). What follows is a funtrace user guide. 12 | 13 | - [Why funtrace?](#why-funtrace) 14 | - [Trying funtrace](#trying-funtrace) 15 | - [Runtime API for taking & saving trace snapshots](#runtime-api-for-taking--saving-trace-snapshots) 16 | - ["Coretime" API for saving trace snapshots](#coretime-api-for-saving-trace-snapshots) 17 | - [Choosing a compiler instrumentation method](#choosing-a-compiler-instrumentation-method) 18 | - [Integrating funtrace into your build system](#integrating-funtrace-into-your-build-system) 19 | - [Culling overhead with `funcount`](#culling-overhead-with-funcount) 20 | - [Decoding traces](#decoding-traces) 21 | - [Compile time & runtime configuration](#compile-time--runtime-configuration) 22 | - [Controlling which functions are traced](#controlling-which-functions-are-traced) 23 | - [Disabling & enabling tracing](#disabling--enabling-tracing) 24 | - [Controlling buffer sizes & lifetimes](#controlling-buffer-sizes--lifetimes) 25 | - [Limitations](#limitations) 26 | - [Funtrace file format](#funtrace-file-format) 27 | 28 | # Why funtrace? 
29 | 30 | * **Low overhead tracing** - FWIW, in my microbenchmark I get <10 ns per instrumented call or return 31 | * **6x faster** than an LLVM XRay microbenchmark with "flight recorder logging" and 15-18x faster than "basic logging" 32 | * **4.5x faster** than a uftrace microbenchmark (note that uftrace isn't just designed for a somewhat different workflow than funtrace - in that it's similar to XRay - but it also has many more features; [check it out](https://github.com/namhyung/uftrace)!) 33 | * Supports **threads, shared libraries and exceptions** 34 | * Supports ftrace events, showing **thread scheduling states** alongside function calls & returns, so you see when time is spent waiting as opposed to computing 35 | * Works with **stock gcc or clang** - no custom compilers or compiler passes 36 | * Easy to integrate into a build system, and even easier to **try *without* touching the build system** using tiny compiler-wrapping scripts “passing all the right flags” 37 | * Small (just ~1K LOC for the runtime) and thus: 38 | * **easy to port** 39 | * **easy to extend** (say, to support some variant of “green threads”/fibers) 40 | * **easy to audit** in case you’re reluctant to add something intrusive like this into your system without understanding it well (as I personally would be!) 41 | * **Relatively comprehensive** – it comes with its own **tool for finding and cutting instrumentation overhead** in test runs too large to fully trace; 42 | support for remapping file paths to locate debug information and source code; a way to **extract trace data from core dumps**, etc. 43 | 44 | # Trying funtrace 45 | 46 | You can clone the repo & build the trace decoder (or unzip [a binary release](https://github.com/yosefk/funtrace/releases)), compile & run a simple example program, and decode its output traces as follows: 47 | 48 | ``` shell 49 | # clone the source... 
50 | git clone https://github.com/yosefk/funtrace 51 | # ...or unzip a binary release 52 | unzip funtrace.zip 53 | 54 | cd funtrace 55 | ./simple-example/build.sh 56 | ./simple-example/run.sh 57 | ``` 58 | 59 | This actually tests 4 different instrumented builds - 2 with gcc and 2 with clang; we'll discuss below how to choose the best method for you. Troubleshooting: 60 | 61 | * With an older clang, you'll get `clang: error: unknown argument: '-fxray-shared'` - in that case, you can use 3 instrumentation methods out of the 4. 62 | * You might have issues accessing ftrace data. This is not a problem for _function tracing_ but it prevents _thread state tracing_, which could tell us when threads are running and when they're waiting: 63 | 64 | ``` 65 | WARNING: funtrace - error initializing ftrace (...), compile with -DFUNTRACE_FTRACE_EVENTS_IN_BUF=0 66 | or run under `env FUNTRACE_FTRACE_EVENTS_IN_BUF=0` if you don't want to collect ftrace / see this warning 67 | ``` 68 | 69 | You can ignore this message, or disable ftrace as described in the message, or you can try making ftrace work. The problem is usually permissions, and one way to make ftrace usable permissions-wise is **`sudo chown -R $USER /sys/kernel/tracing`**. Inside containers, things are more involved, and you might want to consult a source knowing more than this guide. 70 | 71 | You can view the traces produced from the simple example above as follows: 72 | 73 | ``` 74 | pip install viztracer 75 | rehash 76 | vizviewer out/funtrace-fi-gcc.json 77 | vizviewer out/funtrace-pg.json 78 | vizviewer out/funtrace-fi-clang.json 79 | vizviewer out/funtrace-xray.json 80 | ``` 81 | 82 | Funtrace uses [viztracer](https://github.com/gaogaotiantian/viztracer) for visualizing traces, in particular because of its ability to show source code, unlike stock [Perfetto](https://perfetto.dev/) (the basis for vizviewer.) 
83 | 84 | To build your own program with tracing enabled, you can use `compiler-wrappers/funtrace-pg-g++`, `compiler-wrappers/funtrace-finstr-clang++` or the other two compiler wrapper scripts, just like `simple-example/build.sh` does. If the program uses autoconf/configure, you can set the `$CXX` env var to point to one of these scripts, and if it uses cmake, you can pass `-DCMAKE_CXX_COMPILER=/your/chosen/wrapper` to cmake. 85 | 86 | Note that the compiler wrappers slow down the configuration stage, because they compile & link funtrace.cpp, and this is costly at build system config time if the build system compiles many small programs to test for compiler features, library availability and such. For the build itself, the overhead of compiling funtrace.cpp is lower, but might still be annoying if you use a fast linker like mold and are used to near-instantaneous linking. The good thing about the compiler wrappers is that they make trying funtrace easy; if you decide to use funtrace in your program, however, you will probably want to pass the required compiler flags yourself as described below, which will eliminate the build-time overhead of the compiler wrappers. 87 | 88 | Once the program compiles, you can run it as usual, and then `killall -SIGTRAP your-program` (or `kill -SIGTRAP `) when you want to get a trace. The trace will go to `funtrace.raw`; if you use SIGTRAP multiple times, many trace samples will be written to the file. Now you can run `funtrace2viz` the way `simple-example/run.sh` does. You get the funtrace2viz binary from `funtrace.zip`; if you cloned the source repo, you should have funtrace2viz compiled if you ran `simple-example/build.sh`. funtrace2viz will produce a vizviewer JSON file from each trace sample in funtrace.raw, and you can open each JSON file in vizviewer. 89 | 90 | Troubleshooting vizviewer issues: 91 | 92 | * If you see **`Error: RPC framing error`** in the browser tab opened by vizviewer, **reopen the JSON from the web UI**. 
(Note that you want to run vizviewer on every new JSON file, _even if_ it gives you "RPC framing error" when you do it - you _don't_ want to just open the JSON from the web UI since then you won't see source code!) 93 | * If **the timeline looks empty**, it's likely due to some mostly-idle threads having very old events causing the timeline to zoom out too much. (You can simply open the JSON with `less` or whatever - there's a line per function call; if the JSON doesn't look empty, funtrace is working.) **Try passing `--max-event-age` or `--oldest-event-time` to funtrace2viz**; it prints the time range of events recorded for each thread in each trace sample (by default, the oldest event in every sample gets the timestamp 0) and you can use these printouts to decide on the value of the flags. In the next section we'll discuss how to take snapshots at the time you want, of the time range you want, so that you needn't fiddle with flags this way. 94 | 95 | If you build the program, run it, and decode its trace on the same machine/in the same container, life is easy. If not, note that in order for funtrace2viz to work, you need the program and its shared libraries to be accessible at the paths where they were loaded from _in the traced program run_, on the machine _where funtrace2viz runs_. And to see the source code of the functions (as opposed to just function names), you need the source files to be accessible on that machine, at the paths _where they were when the program was built_. If this is not the case, you can remap the paths using a file called `substitute-path.json` in the current directory of funtrace2viz, as described below. 96 | As a side note, if you don't like having to remap source file paths - not just in funtrace but eg in gdb - see [refix](https://github.com/yosefk/refix) which can help to mostly avoid this. 
97 | 98 | Note that if you choose to try XRay instrumentation (`compiler-wrappers/funtrace-xray-clang++`), you need to run with `env XRAY_OPTIONS="patch_premain=true"` like simple-examples/run.sh does. With the other instrumentation options, tracing is on by default. 99 | 100 | The above is how you can give funtrace a quick try. The rest tells how to integrate it in your program "for real." 101 | 102 | # Runtime API for taking & saving trace snapshots 103 | 104 | The next thing after trying funtrace with SIGTRAP is probably using the runtime API to take snapshots of interesting time ranges. (Eventually you'll want proper build system integration - but you probably want to "play some more" beforehand, and since snapshots taken with SIGTRAP aren't taken at "the really interesting times" and capture too much, you'll want to see better snapshots.) 105 | 106 | The recommended method for taking & saving snapshots is: 107 | 108 | * using `funtrace_time()` to find unusually high latency in every flow you care about 109 | * ...then use `funtrace_pause_and_get_snapshot_starting_at_time()` to capture snapshots when a high latency is observed 110 | * ...finally, use `funtrace_write_snapshot()` when you want to save the snapshot(s) taken upon the highest latencies 111 | 112 | In code, it looks something like this: 113 | 114 | ```c++ 115 | #include "funtrace.h" 116 | 117 | void Server::handleRequest() { 118 | uint64_t start_time = funtrace_time(); 119 | 120 | doStuff(); 121 | 122 | uint64_t latency = funtrace_time() - start_time; 123 | if(latency > _slowest) { 124 | funtrace_free_snapshot(_snapshot); 125 | _snapshot = funtrace_pause_and_get_snapshot_starting_at_time(start_time); 126 | _slowest = latency; 127 | } 128 | } 129 | 130 | Server::~Server() { 131 | funtrace_write_snapshot("funtrace-request.raw", _snapshot); 132 | funtrace_free_snapshot(_snapshot); 133 | } 134 | ``` 135 | 136 | There's also `funtrace_pause_and_get_snapshot_up_to_age(max_event_age)` - very similar to 
`funtrace_pause_and_get_snapshot_starting_at_time(start_time)`; and if you want the full content of the trace buffers without an event age limit, there's `funtrace_pause_and_get_snapshot()`. And you can write the snapshot straight from the threads' trace buffers to a file, without allocating memory for a snapshot, using `funtrace_pause_and_write_current_snapshot()` (this is exactly what the SIGTRAP handler does.) 137 | 138 | As implied by their names, **all of these functions pause tracing until they're done** (so that traced events aren't overwritten with new events before we have the chance to save them.) This means that, for example, a concurrent server where `Server::handleRequest()` is called from multiple threads might have a gap in one of the snapshots taken by 2 threads at about the same time; hopefully, unusual latency in 2 threads at the same time is rare, and even if does happen, you'll get at least one good snapshot. 139 | 140 | All of the snapshot-saving functions write to files; an interface for sending the data to some arbitrary stream could be added given demand. 141 | 142 | Finally, a note on the time functions: 143 | 144 | * `funtrace_time()` is a thin wrapper around `__rdtsc()` so you needn't worry about its cost 145 | * `funtrace_ticks_per_second()` gives you the TSC frequency in case you want to convert timestamps or time diffs to seconds/ns 146 | 147 | # "Coretime API" for saving trace snapshots 148 | 149 | While we're on the subject of snapshots - you can get trace data from a core dump by loading `funtrace_gdb.py` from gdb - by running `gdb -x funtrace_gdb.py`, or using the gdb command `python execfile("funtrace_gdb.py")`, or somewhere in `.gdbinit`. 
Then you'll get the extension command `funtrace` which works something like this: 150 | 151 | ``` 152 | (gdb) funtrace 153 | funtrace: saving proc mappings 154 | funtrace: core dump generated by `your-program arg1 arg2` 155 | funtrace: thread 1287700 your-program - saving 1048576 bytes of data read from 0x7fb199c00000 156 | funtrace: thread 1287716 child - saving 1048576 bytes of data read from 0x7fb17c200000 157 | funtrace: saving 22 ftrace events 158 | funtrace: done - decode with `funtrace2viz funtrace.raw out` and then view in viztracer (pip install viztracer) with `vizviewer out.json` 159 | ``` 160 | 161 | Basically it's what SIGTRAP would save to `funtrace.raw`, had it been called right when the core was dumped. Can be very useful to see what the program was doing right before it crashed. 162 | 163 | # Choosing a compiler instrumentation method 164 | 165 | Once you have snapshots of the right time ranges, you might want to settle on a particular compiler instrumentation method. For that, the below can be helpful as well as the next section, which talks about culling overhead with the `funcount` tool (one thing which will help you choose the instrumentation method is how much overhead it adds, which differs between programs, and funcount can help estimate that overhead.) 166 | 167 | Funtrace relies on the compiler inserting hooks upon function calls and returns. Funtrace supports 4 instrumentation methods (2 for gcc and 2 for clang), and comes with a compiler wrapper script passing the right flags to use each: 168 | 169 | * **funtrace-finstr-g++** - gcc with `-finstrument-functions` 170 | * **funtrace-pg-g++** - gcc with `-pg -mfentry -minstrument-return=call` 171 | * **funtrace-finstr-clang++** - clang with `-finstrument-functions` 172 | * **funtrace-xray-clang++** - clang with `-fxray-instrument` 173 | 174 | **"By default," the method used by funtrace-pg-g++ and funtrace-finstr-clang++ is recommended for gcc and clang, respectively**. 
However, for each compiler, there are reasons to use the other method. Here's a table of the methods and their pros and cons, followed by a detailed explanation: 175 | 176 | Method | gcc -finstr | gcc -pg | clang -finstr | clang XRay 177 | --- | --- | --- | --- | --- 178 | before or after inlining? | ❌ before | ✅ after | ✅✅ before or after! | ✅ after 179 | control tracing by source path | ✅ yes | ❌ no | ❌ no | ❌ no 180 | control tracing by function length | ✅ asm | ✅ asm | ✅ asm | ✅✅ compiler 181 | control tracing by function name list | ✅ asm | ✅ asm | ✅ asm | ❌ no 182 | tail call artifacts | ✅ no | ❌ yes | ✅ no | ❌ yes 183 | untraced exception catcher artifacts | ✅ no | ❌ yes | ❌ yes | ❌ yes 184 | needs questionable linker flags | ✅ no | ❌ yes | ✅ no | ❌ yes 185 | 186 | We'll now explain these items in detail, and add a few points about XRay which "don't fit into the table." 187 | 188 | * **Instrument before or after inlining?** You usually prefer "after" - "before" is likely to hurt performance too much (and you can use the NOFUNTRACE macro to suppress the tracing of a function, but you'll need to do this in too many places.) Still, instrumenting before inlining has its uses, eg you can trace the program flow and follow it in vizviewer - for an interactive and/or multithreaded program, this might be easier than using a debugger or an IDE. clang -finstrument-functions is the nicest here - it instruments before inlining, but has a sister flag -finstrument-functions-after-inlining that does what you expect. 189 | * **Control tracing by source path** - gcc's `-finstrument-functions-exclude-file-list=.h,.hpp,/usr/include` (for example) will disable tracing in functions with filenames having the substrings on the comma-separated list. This can somewhat compensate for -finstrument-functions instrumenting before inlining, and you might otherwise use this feature for "targeted tracing." 
190 | * **Control tracing by function length** - XRay has `-fxray-instruction-threshold=N` which excludes short functions from tracing, unless they have loops that XRay assumes will run for a long time. For other instrumentation methods, funtrace comes with its own flag, `-funtrace-instr-thresh=N`, which is implemented by post-processing the assembly code produced by the compiler (funtrace supplies a script, `funtrace++`, which calls the compiler with `-S` instead of `-c` and then post-processes the assembly output and assembles it to produce the final `.o` object file.) XRay's method has 2 advantages, however. Firstly, it removes 100% of the overhead, while funtrace's method removes most (the on-entry/return hooks aren't called), but not all overhead (some extra instructions will appear relatively to the case where the function wasn't instrumented by the compiler in the first place.) Secondly, while the rest of funtrace is very solid, this bit is "hacky"/somewhat heuristical text processing of your compiler-generated assembly, and while it "seems to work" on large programs, you might have reservations against using this in production. 191 | * **Control tracing by function name list** - for all methods other than XRay instrumentation, funtrace provides the flags `-funtrace-do-trace=file` and `-funtrace-no-trace=file` which let you specify which functions to exclude - or not to exclude - from tracing during assembly postprocessing (if you decide to use this postprocessing, of course.) 
This is nice for functions coming from .h files you cannot edit (and thus can't add the `NOFUNTRACE` attribute to the functions you want to exclude); it can also be nice to take a bunch of "frequent callees" reported by the funcount tool (described below) and suppress them using a list of mangled function names, instead of going to the source location of each and adding `NOFUNTRACE` there, especially during experimentation where you're trying to check what suppressing this or that does for the overhead. This doesn't work for XRay ATM (assembly postprocessing could probably be implemented for XRay but would require editing compiler-generated metadata used by the XRay runtime.) 192 | * **Tail call artifacts** is when f calls g, the last thing g does is calling h, and instead of seeing f calling g _which calls h_, you see f calling g _and then h_. This happens because the compiler calls the "on return" hook from g before g's tail call to h. An annoyance if not a huge deal. 193 | * **Untraced exception catcher artifacts** is when you have a function with a `try/catch` block _and_ tracing is disabled for it. In such a case, when an exception is thrown & caught, it looks like _all_ the functions returned and you start from a freshly empty call stack - instead of the correct picture (returning to the function that caught the exception.) This artifact comes from most instrumentation methods not calling the "on return" hook when unwinding the stack. This annoyance is avoided as long as you enable tracing for functions catching exceptions (in which case funtrace traces enough info to get around the return hook not being called upon unwinding.) 194 | * **Questionable linker flags**: 195 | * **clang XRay requires --allow-multiple-definition**. That's because funtrace needs to redefine XRay's on-call/on-return hooks, and there doesn't seem to be another way to do it. If XRay defines its hooks as "weak", this flag will no longer be needed.
196 | * **gcc -pg _precludes_ -Wl,--no-undefined**. That's because its on-return hook, `__return__`, doesn't have a default definition (though its on-entry hook, `__fentry__`, apparently does, as do the entry/return hooks called by -finstrument-functions); your shared objects will get it from the executable but they won't link with `-Wl,--no-undefined`. Note that _all_ the wrappers filter out `-Wl,--no-undefined` so that shared libraries can use the `funtrace_` runtime APIs exported by the executable. However, you don't have to use the runtime APIs in shared objects - you can take snapshots only from code linked into the executable - so except for the -pg mode, this flag is not strictly necessary. 197 | 198 | A few more words about XRay: 199 | 200 | * **XRay instrumentation was enabled in shared libraries in late 2024** and is not yet available in officially released versions. clang versions with XRay shared library support have the `-fxray-shared` flag. 201 | * **XRay uses dynamic code patching for enabling/disabling tracing at runtime.** This is why tracing is off unless you run under `env XRAY_OPTIONS="patch_premain=true"`, or use XRay's runtime APIs to patch the code. Funtrace has its own API, `funtrace_enable/disable_tracing()`, but it deliberately _doesn't_ call XRay's code-patching APIs. Funtrace's API is a quick way to cut most of the overhead of tracing without any self-modifying code business. It's up to you to decide, if you use XRay, whether you want to cut even more overhead by using runtime patching - downsides include creating copies of the code pages, for which you might not have the extra space, and taking more time than funtrace_enable/disable_tracing(). 202 | 203 | # Integrating funtrace into your build system 204 | 205 | You can postpone "real" build system integration for as long as you want, if the compiler wrappers don't slow things down too much for you.
206 | Once you do want to integrate funtrace into your build system, the short story is, **choose an instrumentation method and then compile in the way the respective wrapper in compiler-wrappers does.** However, here are some points worth noting explicitly: 207 | 208 | * **It's fine to compile funtrace.cpp with its own compilation command.** You probably don't want to compile funtrace.cpp when linking your binary the way the wrappers do. They only do it to save you the trouble of adding funtrace.cpp to the list of files for the build system to build (which is harder/more annoying than it sounds, if you're trying to trace someone else's program with a build system you don't really know.) 209 | * **It's best to compile funtrace.cpp without tracing, but "it can handle" being compiled with tracing.** Many build systems make it hard to compile a given file with its own compiler flags. funtrace.cpp uses NOFUNTRACE heavily to suppress tracing; the worst that can happen if you compile it with tracing is that some of its code will be traced despite its best efforts, but it should otherwise work. 210 | * **funtrace.cpp must be compiled _into the executable_, not any of the shared libraries.** Funtrace uses TLS (thread-local storage) and accessing a `thread_local` object is a simple register+offset access when you link the code into an executable, but requires a function call if you link the code into a shared library, because now you need to find _this shared library's TLS area_. So funtrace puts its on-entry/return hooks into the executable, which exports them to the shared libraries. 211 | * **Linker flag requirements** (XRay/`--allow-multiple-definition`, -pg/`-Wl,--no-undefined`) are documented in the previous section; for XRay, you also **need a linker wrapper** like `compiler-wrappers/xray/ld` to make sure funtrace's on-entry/return hooks from funtrace.o are passed before XRay's own hooks on the linker command line. 
212 | * **Pass -pthread** or things will break annoyingly 213 | * **-Wl,--dynamic-list=funtrace.dyn** exports the funtrace runtime API from the executable for the shared libraries 214 | * **-g is for source line info** (it's generally a good idea to use -g in release builds and not just debug builds - if it slows down linking, mold takes care of that; but, if you don't want to compile with -g, funtrace will still give you the function names using the ELF symbol table, only the source code will be missing from vizviewer) 215 | * **Do _not_ pass -pg _to the linker_** - if you use gcc with -pg, and do pass it to the linker, the linker will think that you're compiling for gprof (even if you also pass `-mfentry -minstrument-return=call` which are guaranteed to break gprof, -pg's original application...), and then your program will write a useless gmon.out file in the current directory every time you run it. 216 | * **Some flags in the wrappers are "defaults" that you can change**, specifically: 217 | * `g++ -finstrument-functions-exclude-file-list=.h,.hpp,/usr/include` - of course you can pass a different exclude list 218 | * `clang++ -finstrument-functions-after-inlining` - you can instead pass -finstrument-functions to instrument before inlining 219 | * `-fxray-instruction-threshold=...` is _not_ passed by the XRay wrapper - you can set your own threshold 220 | * **Link the program as C++** - even if it's a C program, the funtrace runtime is in C++ and you'll need to link with g++ or clang++ for things to work 221 | 222 | All the compiler wrappers execute `compiler-wrappers/funtrace++`, itself a compiler wrapper which implements a few flags - `-funtrace-instr-thresh=N`, `-funtrace-ignore-loops`, `-funtrace-do-trace=file`, and `-funtrace-no-trace=file` - for controlling which functions get traced, by changing the assembly code produced by the compiler.
If you don't need any of these flags, you needn't prefix your compilation command with `funtrace++` like the wrappers do. (Funtrace needn't touch the code generated by the compiler for any reason other than supporting these flags.) 223 | 224 | # Culling overhead with `funcount` 225 | 226 | If tracing slows down your program too much, you might want to exclude some functions from tracing. You can do this on some "wide basis", such as "no tracing inside this bunch of libraries, we do compile higher-level logic to trace the overall flow" or such. You can also use `-fxray-instruction-threshold` or `-funtrace-instr-thresh` to automatically exclude short functions without loops. But you might also want to do some "targeted filtering" where you **find functions called very often, and exclude those** (to save both cycles and space in the trace buffer - with many short calls, you need a much larger snapshot to see far enough into the past.) 227 | 228 | `funcount` is a tool for counting function calls, which is recommended for finding "frequent callees" to exclude from traces. Funcount is: 229 | 230 | * **Fast** (about as fast as funtrace and unlike the very slow callgrind) 231 | * **Accurate** (unlike perf which doesn't know how many times a function was called, only how many cycles were spent there and only approximately with its low frequency sampling) 232 | * **Thread-safe** (unlike gprof which produces garbage call counts with multithreaded programs) 233 | * **Small** (~300 LOC) and easy to port 234 | 235 | Finally, funcount **counts exactly the calls funtrace would trace** - nothing that's not traced is counted, and nothing that's traced is left uncounted.
236 | 237 | You enable funcount by passing `-DFUNTRACE_FUNCOUNT` on the command line (only `funtrace.cpp` and `funtrace_pg.S` need this -D, you don't really need to recompile the whole program), or by compiling & linking `funcount.cpp` and `funcount_pg.S` instead of `funtrace.cpp` and `funtrace_pg.S` into your program - whichever is easier in your build system. If the program runs much slower than with funtrace (which can be very slow if you instrument before inlining but otherwise is fairly fast), it must be multithreaded, with the threads running the same code concurrently and fighting over the ownership of the cache lines containing the call counters maintained by funcount. You can compile with `-DFUNCOUNT_PAGE_TABLES=16` or whatever number to have each CPU core update its own copy of each call counter, getting more speed in exchange for space (not that much space - each page table is at worst the size of the executable sections, though on small machines this might matter.) 238 | 239 | At the end of the run, you will see the message: 240 | 241 | `function call count report saved to funcount.txt - decode with funcount2sym to get: call_count, dyn_addr, static_addr, num_bytes, bin_file, src_file:src_line, mangled_func_name` 242 | 243 | `funcount2sym funcount.txt` prints the columns described in the message to standard output; the most commonly interesting ones are highlighted in bold: 244 | 245 | * **`call_count` - the number of times the function was called** 246 | * `dyn_addr` - the dynamic address of the function as loaded into the process (eg what you'd see in `gdb`) 247 | * `static_addr` - the static address of the function in the binary file (what you'd see with `nm`) 248 | * `num_bytes` - the number of bytes making up the function, a proxy for how many instructions long it is 249 | * `bin_file` - the executable or shared library containing the function 250 | * **`src_file:src_line` - the source file & line where the function is defined**, separated by ":" 251 | *
**`mangled_func_name` - the mangled function name**; you can pipe funcount2sym through `c++filt` to demangle it, though often you will want the mangled name 252 | 253 | You can sort this report with `sort -nr` and add reports from multiple runs together with `awk`. To exclude frequently called functions from tracing, you can use the `NOFUNTRACE` attribute (as in `void NOFUNTRACE myfunc()`); `#include "funtrace.h"` to access the macro. You can also use the `-funtrace-no-trace=file` flag implemented by `funtrace++`, and pass it a file with a list of _mangled_ function names. See also "Disabling and enabling tracing" below. This might be faster than opening every relevant source file and adding `NOFUNTRACE` to every excluded function definition, and it avoids issues where the function attribute doesn't exclude the function for whatever reason. 254 | 255 | The advantage of the NOFUNTRACE attribute, apart from being kept together with the function definition (so you know easily what's traced and what's not), is that the overhead is **fully** removed, whereas `-funtrace-no-trace=file` only removes most of the overhead - it removes the calls to the entry/exit hooks, but the code is still "scarred" by the code having been generated. This is a small fraction of the overhead but if lots and lots of functions are "scarred" this way, it can add up. 256 | 257 | If the source files aren't where the debug info says they are, and/or the executable or shared objects are not where they were when the process was running, you can use `substitute-path.json` in the current directory of `funcount2sym` same as with `funtrace2viz`, as described in the next section. 258 | 259 | # Decoding traces 260 | 261 | `funtrace2viz funtrace.raw out` will produce an `out.json`, `out.1.json`, `out.2.json` etc. per trace sample in the file. 
(The snapshot-saving functions only put one sample into a file; the `funtrace.raw` file appended to by SIGTRAP and its programmatic equivalent can contain multiple samples.) 262 | 263 | If funtrace2viz can't find some of the source files or binaries it needs, it will print warnings; you can make it find the files using a `substitute-path.json` in its current directory. This JSON file should contain an array of arrays of length 2, for example: 264 | 265 | ``` json 266 | [ 267 | ["/build/server/source-dir/","/home/user/source-dir/"], 268 | ["/deployment/machine/binary-dir/","/home/user/binary-dir/"] 269 | ] 270 | ``` 271 | For every path string, funtrace2viz iterates over every pair in the array, replacing every occurrence of the first string with the second string in the pair. 272 | 273 | Command line flags: 274 | 275 | * `-r/--raw-timestamps`: report the raw timestamps, rather than defining the earliest timestamp in each sample as 0 and counting from there 276 | * `-e/--executable-file-info`: on top of a function's name, file & line, show the binary it's from and its static address 277 | * `-m/--max-event-age`: ignore events older than this age; this is most likely to be useful for SIGTRAP-type snapshots where you have very old events from mostly idle threads and they cause the GUI timeline to zoom out so much you can't see anything. You can guess what the age is in part by looking at the printouts of funtrace2viz which tells the time range of the events traced from each thread 278 | * `-o/--oldest-event-time`: like `--max-event-age` but with the threshold defined as a timestamp instead of age 279 | * `-t/--threads`: a comma-separated list of thread TIDs - threads outside this list are ignored (including for the purpose of interpreting `--max-event-age` - if you ignore the thread with the most recent event, then the most recent event from threads you didn't ignore becomes "the most recent event" for age calculation purposes.)
This is also something that's mostly useful for SIGTRAP-type snapshots to exclude mostly idle threads 280 | * `-s/--samples`: a comma-separated list of sample indexes - samples outside this list are ignored. Useful for the multi-sample `funtrace.raw` file appended to by SIGTRAP 281 | * `-d/--dry`: useful for a very large multi-sample `funtrace.raw` file if you want to decide what samples to focus on; this prints the time ranges of the threads in each sample, but doesn't decode anything (decoding runs at a rate of about 1MB of binary data per second) 282 | 283 | # Compile-time & runtime configuration 284 | 285 | ## Controlling which functions are traced 286 | 287 | Control at function granularity is only available at build time, as follows: 288 | 289 | * **Compiler function attributes**: 290 | * `NOFUNTRACE` - a function attribute excluding a function from tracing (eg `void NOFUNTRACE func()` - this is the `__attribute__((...))` syntax of gcc/clang). 291 | * `DOFUNTRACE` - a function attribute forcing the inclusion of a function in tracing - currently only meaningful for XRay, which might otherwise exclude functions due to the `-fxray-instruction-threshold=N` flag 292 | * **Assembly filtering flags**: if you use the `funtrace++` wrapper around g++/clang++ in your build system (which you'd want to do solely to get the flags below), you get the option to filter compiler-generated assembly code to exclude some functions from tracing; this is convenient with foreign code (eg functions in standard or external library header files) as well as "to cast a wide net" based on function length a-la XRay's `-fxray-instruction-threshold=N` (_note that assembly filtering is not supported with XRay_): 293 | * `-funtrace-do-trace=file` - the file should contain a list of whitespace-separated mangled function names, these functions will NOT excluded from tracing 294 | * `-funtrace-no-trace=file` - the file should contain a list of whitespace-separated mangled function names, these 
functions WILL be excluded from tracing 295 | * `-funtrace-instr-thresh=N` - functions with less than N instructions will be excluded from tracing together with function calls inlined into them, UNLESS they have loops 296 | * `-funtrace-ignore-loops` - if -funtrace-instr-thresh=N was passed, functions with less than N instructions will be excluded from tracing together with function calls inlined into them, EVEN IF they have loops 297 | 298 | There are thus several ways to ask to include or exclude a function from tracing; what happens if they conflict? 299 | 300 | * NOFUNTRACE "always wins" (unless there's a compiler issue where it's ignored for whatever reason) - you can't trace a function successfully excluded with NOFUNTRACE 301 | * DOFUNTRACE currently only means the function will survive XRay filtering; it does nothing for other instrumentation methods, so the function might be excluded from tracing with these methods (eg by -finstrument-functions-after-inlining or -finstrument-functions-exclude-file-list) 302 | * For functions which "survived exclusion by the compiler": 303 | * A function on the list passed to -funtrace-do-trace is always kept 304 | * Otherwise, a function on the list passed to -funtrace-no-trace is excluded, and so are function calls inlined into it 305 | * Otherwise, a function with less than N instructions where N was defined with -funtrace-instr-thresh=N and has no loops is excluded, and so are function calls inlined into it. If it has loops but -funtrace-ignore-loops was passed, it is also excluded, and so are function calls inlined into it.
306 | 307 | ## Disabling & enabling tracing 308 | 309 | * `funtrace_ignore_this_thread()` excludes the calling thread from tracing "forever" (there's currently no way to undo this) 310 | * `funtrace_disable_tracing()` disables tracing globally (note that taking a snapshot effectively does the same thing until the snapshot is ready) 311 | * `funtrace_enable_tracing()` (re-)enables the tracing globally (by default, tracing is on when the program starts so you needn't do it; "on by default" means you can get a trace from a core dump and from a live process with SIGTRAP without any tweaking to the program source) 312 | 313 | Additionally, compiling with -DFUNTRACE_FTRACE_EVENTS_IN_BUF=0 or setting $FUNTRACE_FTRACE_EVENTS_IN_BUF to 0 at runtime effectively disables ftrace scheduling event tracing, as mentioned again in the next section. 314 | 315 | ## Controlling buffer sizes & lifetimes 316 | 317 | * `funtrace_set_thread_log_buf_size(log_buf_size)` sets the trace buffer size of the calling thread to `pow(2, log_buf_size)`. Passing 0 (or a value smaller than log(size of 2 trace entries), so currently 5) is equivalent to calling `funtrace_ignore_this_thread()` 318 | * The following parameters can be controlled by passing `-DNAME=VALUE` to the compiler (the command line equivalent of `#define NAME VALUE`), and/or reconfigured at runtime by setting the environment variable `$NAME` to `VALUE`: 319 | * `FUNTRACE_LOG_BUF_SIZE`: each thread starts with a thread-local trace buffer of this size (the default is 20, meaning 1M bytes = 32K trace entries ~= 16K most recent function calls.) This initial buffer size can then be changed using `funtrace_set_thread_log_buf_size()` 320 | * `FUNTRACE_FTRACE_EVENTS_IN_BUF`: the number of entries in this process's userspace ftrace buffer (the default is 20000; the size in bytes can vary since each entry keeps one line of textual ftrace data.) 
Passing `-DFUNTRACE_FTRACE_EVENTS_IN_BUF=0` disables ftrace at compile time - this **cannot** be changed by setting the env var at runtime to a non-zero value. 321 | * `FUNTRACE_GC_MAX_AGE_MS`: when set to 0, a thread's thread-local trace buffer is freed upon thread exit - which means the trace data will be missing from future snapshots, even though the events in that buffer might have been recorded during the time range covered by the snapshot. When set to a non-zero value (default: 300 ms), thread trace buffers are kept after thread exit, and garbage-collected every FUNTRACE_GC_PERIOD_MS (see below); only buffers with age exceeding FUNTRACE_GC_MAX_AGE_MS are freed. Passing `-DFUNTRACE_GC_MAX_AGE_MS=0` disables garbage collection at compile time - this **cannot** be changed by setting the env var at runtime to a non-zero value. 322 | * `FUNTRACE_GC_PERIOD_MS`: unless compiled out by #defining FUNTRACE_GC_MAX_AGE_MS to 0, the thread trace buffer garbage collection runs every FUNTRACE_GC_PERIOD_MS ms (default: the compile-time value of FUNTRACE_GC_MAX_AGE_MS.) 323 | 324 | # Limitations 325 | 326 | * **Can't trace inside shared libraries unless they're loaded by an executable containing the funtrace runtime** - for example, a Python extension module written in C++ can't be traced, similarly to any other kind of plugin loaded by a program not compiled with funtrace. This is because of the TLS issue explained above.
327 | * **Thread creation/exit and saving a trace snapshot take the same lock** - this can slow things down; hopefully not too badly since saving a snapshot is pretty fast, and creating lots of threads at runtime (rather than reusing from a thread pool) should be rare 328 | * **ftrace / thread scheduling events might have issues near the snapshot time range boundaries**: 329 | * Perfetto might not render thread status very clearly near the boundaries even when it's clear from the ftrace log 330 | * There's a latency between a thread scheduling event and the moment it's delivered to funtrace's userspace thread collecting the events (we try to give this thread a high priority but will typically lack permissions to give it a real-time priority.) One way around this could be *a mechanism for "late delivery" of ftrace events into snapshots* - since most of the time, snapshots are written to the file system much later than they're captured, we could put ftrace events into those already-captured, but not-yet-written-out snapshots whose time range contains a given newly arrived event. Doable, but a bit of a hassle, could be done given demand. 331 | * **Threads which exited by the time a snapshot was taken might be invisible in the trace** - unless the thread trace GC parameters were tuned such that the trace buffer is still around when the snapshot is taken, as explained above 332 | * **Funcount misses constructor calls** - shouldn't matter for its goal of finding functions called so often that you want to exclude them from tracing to avoid the overhead 333 | * **Overlapping time ranges** should never happen but might in some cases. The Perfetto/Chromium JSON spec requires events' time ranges to be nested within each other or not overlap at all. funtrace2viz takes this requirement seriously (rather than breaking it on the currently seemingly correct theory that some ways of breaking it are actually supported.) 
So when funtrace2viz observes that 20 functions have just returned (by seeing that f which called 19 functions has just returned, perhaps because of a longjmp or an exception being caught), it produces 20 different timestamps apart by at least 1 ns, the smallest time unit in the JSON. Some of these made-up return timestamps might cause overlap with later function calls. 334 | * **Tail call artifacts** with some instrumentation methods, as documented in the section "Choosing compiler instrumentation" 335 | * **Untraced exception catcher artifacts** with some instrumentation methods, as documented in the section "Choosing compiler instrumentation." A related but likely extremely rare artifact you might see with these instrumentation methods is mixing recursion and exception handling where you have a recursive function that doesn't catch an exception at the innermost recursion level but then does catch it at another level - funtrace trace analysis will incorrectly assume the exception was caught at the innermost level (unless `gcc -finstrument-functions` was used, which calls the on-return hook when unwinding the stack and doesn't require guesswork at trace analysis time.) 336 | * **Unloading traced shared libraries within the time range of a snapshot is unsupported** - a trace snapshot contains an address space snapshot made at the end of the time range, so if a shared library was unloaded, functions traced from it will not be decodable in the trace; reusing the executable address space for new addresses will mess up decoding further. A need to dlclose libraries midway thru the tracing is probably extremely rare. 337 | * **Mixing instrumentation methods in the same build or process wasn't tested** and might not work for various reasons; this feels like a fairly esoteric need, but can almost certainly be made to work given demand. 
338 | 339 | # Funtrace file format 340 | 341 | You don't need to know this format unless you want to generate or process `funtrace.raw` files, or extend funtrace for your needs. 342 | 343 | Funtrace data is binary, using little endian encoding for integers. It consists of "chunks" where each chunk has an 8-byte magic number, a 64-bit size integer, and then a sequence of data bytes of the length specified by the size integer. Here are the chunk types and the format of the data: 344 | 345 | * **`PROCMAPS`**: the content of `/proc/self/maps` can go here; only the start, end, offset and path fields are used, and only the executable segments are listed at this stage (funtrace uses `dl_iterate_phdr` rather than `/proc/self/maps` to speed up snapshotting), but readonly data segments might go here eventually, too, eg if we implement custom log messages with [delayed formatting](https://yosefk.com/blog/delayed-printf-for-real-time-logging.html). Permissions and inode info are ignored. 346 | * **`FUNTRACE`**: an 8-byte chunk indicating the start of a snapshot, with an 8-byte frequency of the timestamp counter, used to convert counter values into nanoseconds. A snapshot is interpreted according to the memory map reported by the last encountered `PROCMAPS` chunk (there may be many snapshots in the same file; currently the funtrace runtime saves a `PROCMAPS` chunk every time it takes a snapshot but if you know that your memory map remains stable over time and you want to shave off a little bit of latency, you could tweak this.) 347 | * **`CMD LINE`**: the process command line, used as the process name when generating the JSON. 
A wart worth mentioning is that currently, the funtrace runtime reads this from `/proc/self/cmdline` and replaces null characters separating the arguments with spaces, which means that the shell command `prog "aaa bbb"`, which passes a single string argument `aaa bbb`, will be saved as `prog aaa bbb` (two string arguments). So we save enough to help you see "the trace of what you're looking at" but not enough to eg use the saved command line for reproducing the run. 348 | * **`THREADID`**: a 64b PID integer, a 64b TID integer, and a null-terminated 16-byte name string (the content of `/proc/self/comm` aka the output of `pthread_getname_np(pthread_self(),...)`.) This precedes every `TRACEBUF` chunk (documented next.) 349 | * **`TRACEBUF`**: a variable sized chunk of length which is a multiple of 16. It contains trace entries; each entry is a 64b code pointer, and a 64b timestamp counter value. The entries are _not_ sorted by the timestamp, for 2 reasons - they come from a cyclic buffer, and the funtrace writeout code is racy, so you can have rare cases of `new_entry, old_entry, new_entry` near the end of the cyclic buffer because one of the newest entries didn't make it into the buffer so you got a much older entry. So you need to sort the entries for processing, and you need to "defend" against missing events (meaning, you could see a return without a call or a call without a return; this is not just because of the raciness of the writeout but because the cyclic buffer ends before "the end of program execution" and starts after "the start of execution" and you can have various other niceties like longjmp.) The code pointer can have the following flags set in its high bits: 350 | * `RETURN` (63): a return event, where the code pointer points into the returning function 351 | * `RETURN_WITH_CALLER_ADDRESS` (62): a return event where the code pointer points _into the function we're returning to_. 
This unfortunate tracing artifact happens under XRay instrumentation; funtrace2viz mostly recovers the flow despite this. When this bit and the previous bit are both set, this is a `CATCH` event, and the code pointer points into the function that caught the exception. 352 | * `CALL_RETURNING_UPON_THROW` (61): marks call events that will have a return event logged for them if an exception is thrown. Under most instrumentation methods this does not happen and so funtrace2viz guesses which functions effectively returned during stack unwinding. When it sees a call entry with this flag set, it knows that this function wouldn't return without logging a return event even if an exception was thrown, which prevents it from wrongly guessing that the function returned due to unwinding. 353 | * **`FTRACETX`**: a variable-sized chunk containing textual ftrace data (one event per line - what you read from `/sys/kernel/tracing/trace_pipe`). The timestamps in this data and the trace entries from `TRACEBUF` are from the same time source. 354 | * **`ENDTRACE`**: a zero-sized chunk marking the end of a snapshot. 355 | --------------------------------------------------------------------------------