├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE_1_0.txt ├── README.md ├── examples ├── explicit.rs ├── implicit.rs └── noguard.rs └── src └── lib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | sudo: required 3 | rust: 4 | - nightly 5 | - beta 6 | before_script: 7 | - pip install 'travis-cargo<0.2' --user && export PATH=$HOME/.local/bin:$PATH 8 | script: 9 | - | 10 | travis-cargo build && 11 | travis-cargo test && 12 | travis-cargo bench && 13 | travis-cargo doc 14 | after_success: 15 | - travis-cargo --only beta doc-upload 16 | - travis-cargo coveralls 17 | env: 18 | global: 19 | - secure: cpewKWIxIogX0DDZZajvlQkXR29Mc2JHLMzNQjO1FsVA67d5fsYlhGCIzF4eb32Yz73IpNFi3yKAdDfNYIVoYwWAYDRwJX5+noV3uQy5yhPRo5c5XYEYXXrYQkqaXArShvSw2Aq+Q94jK3rKT0Q4XaL7jwjDzvzY7dTBWlpBvk0= 20 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hprof" 3 | version = "0.1.3" 4 | authors = ["Corey Richardson "] 5 | description = "A simple hierarchical profiler" 6 | documentation = "https://cmr.github.io/hprof" 7 | repository = "https://github.com/cmr/hprof" 8 | readme = "README.md" 9 | license = "BSL-1.0" 10 | 11 | [dependencies] 12 | clock_ticks = "0.1.0" 13 | log = "0.3.4" 14 | 15 | [features] 16 | unstable = [] 17 | -------------------------------------------------------------------------------- /LICENSE_1_0.txt: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person or organization 2 | obtaining a copy of the software and accompanying documentation covered by 3 | this license (the "Software") to use, reproduce, display, distribute, execute, 4 | and transmit the Software, and to prepare derivative works of the Software, 5 | and to permit third-parties to whom the Software is furnished to do so, all 6 | subject to the following: 7 | 8 | The copyright notices in the Software and this entire statement, including the 9 | above license grant, this restriction and the following disclaimer, must be 10 | included in all copies of the Software, in whole or in part, and all 11 | derivative works of the Software, unless such copies or derivative works are 12 | solely in the form of machine-executable object code generated by a source 13 | language processor. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 18 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR 19 | ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 20 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `hprof`, a real-time hierarchical profiler 2 | 3 | [![Travis](https://img.shields.io/travis/cmr/hprof.svg?style=flat-square)](https://travis-ci.org/cmr/hprof) 4 | [![Crates.io](https://img.shields.io/crates/v/hprof.svg?style=flat-square)](https://crates.io/crates/hprof) 5 | 6 | [Documentation](https://cmr.github.io/hprof) 7 | 8 | `hprof` is suitable only for getting rough measurements of "systems", rather 9 | than fine-tuned profiling data. Consider using `perf`, `SystemTap`, `DTrace`, 10 | `VTune`, etc for more detailed profiling. 11 | 12 | # What is hierarchical profiling? 13 | 14 | Hierarchical profiling is based on the observation that games are typically 15 | organized into a "tree" of behavior. You have an AI system that does path 16 | planning, making tactical decisions, etc. You have a physics system that does 17 | collision detection, rigid body dynamics, etc. A tree might look like: 18 | 19 | - Physics 20 | - Collision detection 21 | - Broad phase 22 | - Narrow phase 23 | - Fluid simulation 24 | - Rigid body simulation 25 | - Collision resolution 26 | - Update positions 27 | - AI 28 | - Path planning 29 | - Combat tactics 30 | - Build queue maintenance 31 | - Render 32 | - Frustum culling 33 | - Draw call sorting 34 | - Draw call submission 35 | - GPU wait 36 | 37 | A hierarchical profiler will annotate this tree with how much time each step 38 | took. This is an extension of timer-based profiling, where a timer is used to 39 | measure how long a block of code takes to execute. Rather than coding up a 40 | one-time timer, you merely call `Profiler::enter("description of thing")` and 41 | a new entry will be made in the profile tree. 42 | 43 | The idea came from a 2002 article in Game Programming Gems 3, "Real-Time 44 | Hierarchical Profiling" by Greg Hjelstrom and Byon Garrabrant from Westwood 45 | Studios. They report having thousands of profile nodes active at a time. 46 | 47 | # License 48 | 49 | 50 | This software is licensed under the [Boost Software 51 | License](http://www.boost.org/users/license.html). In short, you are free to 52 | use, modify, and redistribute in any form without attribution. 53 | 54 | # Example Output 55 | 56 | ``` 57 | Timing information for main loop: 58 | setup - 1133523ns (6.725068%) 59 | physics - 2258292ns (13.3982%) 60 | collision - 1140731ns (50.512998%) 61 | update positions - 1108782ns (49.098257%) 62 | render - 13446767ns (79.778204%) 63 | cull - 1134725ns (8.438646%) 64 | gpu submit - 2197346ns (16.341073%) 65 | gpu wait - 10088879ns (75.028287%) 66 | ``` 67 | -------------------------------------------------------------------------------- /examples/explicit.rs: -------------------------------------------------------------------------------- 1 | extern crate hprof; 2 | 3 | fn main() { 4 | let p = hprof::Profiler::new("main loop"); 5 | 6 | loop { 7 | p.start_frame(); 8 | 9 | { 10 | let _g = p.enter("setup"); 11 | std::thread::sleep_ms(1); 12 | } 13 | { 14 | let _g = p.enter("physics"); 15 | 16 | let _g = p.enter("collision"); 17 | std::thread::sleep_ms(1); 18 | drop(_g); 19 | 20 | let _g = p.enter("update positions"); 21 | std::thread::sleep_ms(1); 22 | drop(_g); 23 | } 24 | { 25 | let _g = p.enter("render"); 26 | 27 | let _g = p.enter("cull"); 28 | std::thread::sleep_ms(1); 29 | drop(_g); 30 | 31 | let _g = p.enter("gpu submit"); 32 | std::thread::sleep_ms(2); 33 | drop(_g); 34 | 35 | let _g = p.enter("gpu wait"); 36 | std::thread::sleep_ms(10); 37 | } 38 | 39 | p.end_frame(); 40 | 41 | // this would usually depend on a debug flag, or use custom functionality for drawing the 42 | // debug information. 43 | if true { 44 | p.print_timing(); 45 | } 46 | break; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/implicit.rs: -------------------------------------------------------------------------------- 1 | extern crate hprof; 2 | 3 | fn main() { 4 | loop { 5 | hprof::start_frame(); 6 | 7 | { 8 | let _g = hprof::enter("setup"); 9 | std::thread::sleep_ms(1); 10 | } 11 | { 12 | let _g = hprof::enter("physics"); 13 | 14 | let _g = hprof::enter("collision"); 15 | std::thread::sleep_ms(1); 16 | drop(_g); 17 | 18 | let _g = hprof::enter("update positions"); 19 | std::thread::sleep_ms(1); 20 | drop(_g); 21 | } 22 | { 23 | let _g = hprof::enter("render"); 24 | 25 | let _g = hprof::enter("cull"); 26 | std::thread::sleep_ms(1); 27 | drop(_g); 28 | 29 | let _g = hprof::enter("gpu submit"); 30 | std::thread::sleep_ms(2); 31 | drop(_g); 32 | 33 | let _g = hprof::enter("gpu wait"); 34 | std::thread::sleep_ms(10); 35 | drop(_g); 36 | } 37 | 38 | hprof::end_frame(); 39 | 40 | // this would usually depend on a debug flag, or use custom functionality for drawing the 41 | // debug information. 42 | if true { 43 | hprof::profiler().print_timing(); 44 | } 45 | break; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /examples/noguard.rs: -------------------------------------------------------------------------------- 1 | extern crate hprof; 2 | 3 | fn main() { 4 | let p = hprof::Profiler::new("main loop"); 5 | 6 | loop { 7 | p.start_frame(); 8 | 9 | { 10 | p.enter_noguard("setup"); 11 | std::thread::sleep_ms(1); 12 | p.leave(); 13 | } 14 | { 15 | p.enter_noguard("physics"); 16 | 17 | p.enter_noguard("collision"); 18 | std::thread::sleep_ms(1); 19 | p.leave(); 20 | 21 | p.enter_noguard("update positions"); 22 | std::thread::sleep_ms(1); 23 | p.leave(); 24 | 25 | p.leave(); 26 | } 27 | { 28 | p.enter_noguard("render"); 29 | 30 | p.enter_noguard("cull"); 31 | std::thread::sleep_ms(1); 32 | p.leave(); 33 | 34 | p.enter_noguard("gpu submit"); 35 | std::thread::sleep_ms(2); 36 | p.leave(); 37 | 38 | p.enter_noguard("gpu wait"); 39 | std::thread::sleep_ms(10); 40 | p.leave(); 41 | 42 | p.leave(); 43 | } 44 | 45 | p.end_frame(); 46 | 47 | // this would usually depend on a debug flag, or use custom functionality for drawing the 48 | // debug information. 49 | if true { 50 | p.print_timing(); 51 | } 52 | break; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright Corey Richardson 2015 2 | // Distributed under the Boost Software License, Version 1.0. 3 | // (See accompanying file LICENSE_1_0.txt or copy at 4 | // http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | //! A real-time hierarchical profiler. 7 | //! 8 | //! # What is hierarchical profiling? 9 | //! 10 | //! Hierarchical profiling is based on the observation that games are typically 11 | //! organized into a "tree" of behavior. You have an AI system that does path 12 | //! planning, making tactical decisions, etc. You have a physics system that does 13 | //! collision detection, rigid body dynamics, etc. A tree might look like: 14 | //! 15 | //! - Physics 16 | //! - Collision detection 17 | //! - Broad phase 18 | //! - Narrow phase 19 | //! - Fluid simulation 20 | //! - Rigid body simulation 21 | //! - Collision resolution 22 | //! - Update positions 23 | //! - AI 24 | //! - Path planning 25 | //! - Combat tactics 26 | //! - Build queue maintenance 27 | //! - Render 28 | //! - Frustum culling 29 | //! - Draw call sorting 30 | //! - Draw call submission 31 | //! - GPU wait 32 | //! 33 | //! A hierarchical profiler will annotate this tree with how much time each step 34 | //! took. This is an extension of timer-based profiling, where a timer is used to 35 | //! measure how long a block of code takes to execute. Rather than coding up a 36 | //! one-time timer, you merely call `Profiler::enter("description of thing")` and 37 | //! a new entry will be made in the profile tree. 38 | //! 39 | //! The idea came from a 2002 article in Game Programming Gems 3, "Real-Time 40 | //! Hierarchical Profiling" by Greg Hjelstrom and Byon Garrabrant from Westwood 41 | //! Studios. They report having thousands of profile nodes active at a time. 42 | //! 43 | //! There are two major ways to use this library: with explicit profilers, and with an implicit 44 | //! profiler. 45 | //! 46 | //! # Implicit (thread-local) profiler 47 | //! 48 | //! To use the implicit profiler, call `hprof::start_frame()`, `hprof::end_frame()`, and 49 | //! `hprof::enter("name")`. Destructors will take care of the rest. You can access the profiler 50 | //! using `hprof::profiler()`. 51 | //! 52 | //! # Explicit profilers 53 | //! 54 | //! Use `Profiler::new()` and pass it around/store it somewhere (for example, using 55 | //! [`current`](https://github.com/PistonDevelopers/current)). 56 | 57 | #[macro_use] 58 | extern crate log; 59 | extern crate clock_ticks; 60 | 61 | use std::cell::{Cell, RefCell}; 62 | use std::rc::Rc; 63 | 64 | thread_local!(static HPROF: Profiler = Profiler::new("root profiler")); 65 | 66 | /// A single tree of profile data. 67 | pub struct Profiler { 68 | root: Rc, 69 | current: RefCell>, 70 | enabled: Cell, 71 | } 72 | 73 | /// A "guard" for calling `Profiler::leave` when it is destroyed. 74 | pub struct ProfileGuard<'a>(&'a Profiler); 75 | impl<'a> Drop for ProfileGuard<'a> { 76 | fn drop(&mut self) { 77 | self.0.leave() 78 | } 79 | } 80 | 81 | macro_rules! early_leave { 82 | ($slf:ident) => (if $slf.enabled.get() == false { return }) 83 | } 84 | 85 | impl Profiler { 86 | /// Create a new profiler with the given name for the root node. 87 | pub fn new(name: &'static str) -> Profiler { 88 | let root = Rc::new(ProfileNode::new(None, name)); 89 | root.call(); 90 | Profiler { root: root.clone(), current: RefCell::new(root), enabled: Cell::new(true) } 91 | } 92 | 93 | /// Enter a profile node for `name`, returning a guard object that will `leave` on destruction. 94 | pub fn enter(&self, name: &'static str) -> ProfileGuard { 95 | self.enter_noguard(name); 96 | ProfileGuard(self) 97 | } 98 | 99 | /// Enter a profile node for `name`. 100 | pub fn enter_noguard(&self, name: &'static str) { 101 | early_leave!(self); 102 | { 103 | let mut curr = self.current.borrow_mut(); 104 | if curr.name != name { 105 | *curr = curr.make_child(curr.clone(), name); 106 | } 107 | } 108 | self.current.borrow().call(); 109 | } 110 | 111 | /// Leave the current profile node. 112 | pub fn leave(&self) { 113 | early_leave!(self); 114 | let mut curr = self.current.borrow_mut(); 115 | if curr.ret() == true { 116 | if let Some(parent) = curr.parent.clone() { 117 | *curr = parent; 118 | } 119 | } 120 | } 121 | 122 | /// Print out the current timing information in a very naive way. 123 | pub fn print_timing(&self) { 124 | println!("Timing information for {}:", self.root.name); 125 | for child in &*self.root.children.borrow() { 126 | child.print(2); 127 | } 128 | } 129 | 130 | /// Return the root profile node for inspection. 131 | /// 132 | /// This root will always be valid and reflect the current state of the `Profiler`. 133 | /// It is not advised to inspect the data between calls to `start_frame` and `end_frame`. 134 | pub fn root(&self) -> Rc { 135 | self.root.clone() 136 | } 137 | 138 | /// Finish a frame. 139 | /// 140 | /// Logs an error if there are pending `leave` calls, and later attempts to 141 | /// print timing data will be met with sadness in the form of `NaN`s. 142 | pub fn end_frame(&self) { 143 | early_leave!(self); 144 | if &*self.root as *const ProfileNode as usize != &**self.current.borrow() as *const ProfileNode as usize { 145 | error!("Pending `leave` calls on Profiler::frame"); 146 | } else { 147 | self.root.ret(); 148 | } 149 | } 150 | 151 | /// Start a frame. 152 | /// 153 | /// Resets timing data. Logs an error if there are pending `leave` calls, but there are 154 | /// otherwise no ill effects. 155 | pub fn start_frame(&self) { 156 | early_leave!(self); 157 | if &*self.root as *const ProfileNode as usize != &**self.current.borrow() as *const ProfileNode as usize { 158 | error!("Pending `leave` calls on Profiler::frame"); 159 | } 160 | *self.current.borrow_mut() = self.root.clone(); 161 | self.root.reset(); 162 | self.root.call(); 163 | } 164 | 165 | /// Disable the profiler. 166 | /// 167 | /// All calls until `enable` will do nothing. 168 | pub fn disable(&self) { 169 | self.enabled.set(false); 170 | } 171 | 172 | /// Enable the profiler. 173 | /// 174 | /// Calls will take effect until `disable` is called. 175 | pub fn enable(&self) { 176 | self.enabled.set(true); 177 | } 178 | 179 | /// Toggle the profiler enabledness. 180 | pub fn toggle(&self) { 181 | self.enabled.set(!self.enabled.get()); 182 | } 183 | 184 | } 185 | 186 | /// A single node in the profile tree. 187 | /// 188 | /// *NOTE*: While the fields are public and are a cell, it is not advisable to modify them. 189 | pub struct ProfileNode { 190 | pub name: &'static str, 191 | /// Number of calls made to this node. 192 | pub calls: Cell, 193 | /// Total time in ns used by this node and all of its children. 194 | /// 195 | /// Computed after the last pending `ret`. 196 | pub total_time: Cell, 197 | /// Timestamp in ns when the first `call` was made to this node. 198 | pub start_time: Cell, 199 | /// Number of recursive calls made to this node since the first `call`. 200 | pub recursion: Cell, 201 | /// Parent in the profile tree. 202 | pub parent: Option>, 203 | // TODO: replace this Vec with an intrusive list. Use containerof? 204 | /// Child nodes. 205 | pub children: RefCell>>, 206 | } 207 | 208 | impl ProfileNode { 209 | pub fn new(parent: Option>, name: &'static str) -> ProfileNode { 210 | ProfileNode { 211 | name: name, 212 | calls: Cell::new(0), 213 | total_time: Cell::new(0), 214 | start_time: Cell::new(0), 215 | recursion: Cell::new(0), 216 | parent: parent, 217 | children: RefCell::new(Vec::new()) 218 | } 219 | } 220 | 221 | /// Reset this node and its children, seting relevant fields to 0. 222 | pub fn reset(&self) { 223 | self.calls.set(0); 224 | self.total_time.set(0); 225 | self.start_time.set(0); 226 | self.recursion.set(0); 227 | for child in &*self.children.borrow() { 228 | child.reset() 229 | } 230 | } 231 | 232 | /// Create a child named `name`. 233 | pub fn make_child(&self, me: Rc, name: &'static str) -> Rc { 234 | let mut children = self.children.borrow_mut(); 235 | for child in &*children { 236 | if child.name == name { 237 | return child.clone() 238 | } 239 | } 240 | let new = Rc::new(ProfileNode::new(Some(me), name)); 241 | children.push(new.clone()); 242 | new 243 | } 244 | 245 | /// Enter this profile node. 246 | pub fn call(&self) { 247 | self.calls.set(self.calls.get() + 1); 248 | let rec = self.recursion.get(); 249 | if rec == 0 { 250 | self.start_time.set(clock_ticks::precise_time_ns()); 251 | } 252 | self.recursion.set(rec + 1); 253 | } 254 | 255 | /// Return from this profile node, returning true if there are no pending recursive calls. 256 | pub fn ret(&self) -> bool { 257 | let rec = self.recursion.get(); 258 | if rec == 1 { 259 | let time = clock_ticks::precise_time_ns(); 260 | let durr = time - self.start_time.get(); 261 | self.total_time.set(self.total_time.get() + durr); 262 | } 263 | self.recursion.set(rec - 1); 264 | rec == 1 265 | } 266 | 267 | /// Print out the current timing information in a very naive way. 268 | /// 269 | /// Uses `indent` to determine how deep to indent the line. 270 | pub fn print(&self, indent: u32) { 271 | for _ in 0..indent { 272 | print!(" "); 273 | } 274 | let parent_time = self.parent 275 | .as_ref() 276 | .map(|p| p.total_time.get()) 277 | .unwrap_or(self.total_time.get()) as f64; 278 | let percent = 100.0 * (self.total_time.get() as f64 / parent_time); 279 | if percent.is_infinite() { 280 | println!("{name} - {calls} * {each} = {total} @ {hz:.1}hz", 281 | name = self.name, 282 | calls = self.calls.get(), 283 | each = Nanoseconds((self.total_time.get() as f64 / self.calls.get() as f64) as u64), 284 | total = Nanoseconds(self.total_time.get()), 285 | hz = self.calls.get() as f64 / self.total_time.get() as f64 * 1e9f64 286 | ); 287 | } else { 288 | println!("{name} - {calls} * {each} = {total} ({percent:.1}%)", 289 | name = self.name, 290 | calls = self.calls.get(), 291 | each = Nanoseconds((self.total_time.get() as f64 / self.calls.get() as f64) as u64), 292 | total = Nanoseconds(self.total_time.get()), 293 | percent = percent 294 | ); 295 | } 296 | for c in &*self.children.borrow() { 297 | c.print(indent+2); 298 | } 299 | } 300 | } 301 | 302 | pub fn profiler() -> &'static Profiler { 303 | HPROF.with(|p| unsafe { std::mem::transmute(p) } ) 304 | } 305 | 306 | pub fn enter(name: &'static str) -> ProfileGuard<'static> { 307 | HPROF.with(|p| unsafe { std::mem::transmute::<_, &'static Profiler>(p) }.enter(name) ) 308 | } 309 | 310 | pub fn start_frame() { 311 | HPROF.with(|p| p.start_frame()) 312 | } 313 | 314 | pub fn end_frame() { 315 | HPROF.with(|p| p.end_frame()) 316 | } 317 | 318 | // used to do a pretty printing of time 319 | struct Nanoseconds(u64); 320 | 321 | impl std::fmt::Display for Nanoseconds { 322 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 323 | if self.0 < 1_000 { 324 | write!(f, "{}ns", self.0) 325 | } else if self.0 < 1_000_000 { 326 | write!(f, "{:.1}us", self.0 as f64 / 1_000.) 327 | } else if self.0 < 1_000_000_000 { 328 | write!(f, "{:.1}ms", self.0 as f64 / 1_000_000.) 329 | } else { 330 | write!(f, "{:.1}s", self.0 as f64 / 1_000_000_000.) 331 | } 332 | } 333 | } 334 | --------------------------------------------------------------------------------