| 1 | //! Profiling counters and their implementation. | 
| 2 | //! | 
|---|
| 3 | //! # Available counters | 
|---|
| 4 | //! | 
|---|
| 5 | //! Name (for [`Counter::by_name()`]) | Counter                      | OSes  | CPUs | 
|---|
| 6 | //! --------------------------------- | -------                      | ----  | ---- | 
|---|
| 7 | //! `wall-time`                       | [`WallTime`]                 | any   | any | 
|---|
| 8 | //! `instructions:u`                  | [`Instructions`]             | Linux | `x86_64` | 
|---|
| 9 | //! `instructions-minus-irqs:u`       | [`InstructionsMinusIrqs`]    | Linux | `x86_64`<br>- AMD (since K8)<br>- Intel (since Sandy Bridge) | 
|---|
| 10 | //! `instructions-minus-r0420:u`      | [`InstructionsMinusRaw0420`] | Linux | `x86_64`<br>- AMD (Zen) | 
|---|
| 11 | //! | 
|---|
| 12 | //! *Note: `:u` suffixes for hardware performance counters come from the Linux `perf` | 
|---|
| 13 | //! tool, and indicate that the counter is only active while userspace code executes | 
|---|
| 14 | //! (i.e. it's paused while the kernel handles syscalls, interrupts, etc.).* | 
|---|
| 15 | //! | 
|---|
| 16 | //! # Limitations and caveats | 
|---|
| 17 | //! | 
|---|
| 18 | //! *Note: for more information, also see the GitHub PR which first implemented hardware | 
|---|
| 19 | //! performance counter support ([#143](https://github.com/rust-lang/measureme/pull/143)).* | 
|---|
| 20 | //! | 
|---|
| 21 | //! The hardware performance counters (i.e. all counters other than `wall-time`) are limited to: | 
|---|
| 22 | //! * Linux, for out-of-the-box performance counter reads from userspace | 
|---|
| 23 | //!   * other OSes could work through custom kernel extensions/drivers, in the future | 
|---|
| 24 | //! * `x86_64` CPUs, mostly due to lack of other available test hardware | 
|---|
| 25 | //!   * new architectures would be easier to support (on Linux) than new OSes | 
|---|
| 26 | //!   * easiest to add would be 32-bit `x86` (aka `i686`), which would reuse | 
|---|
| 27 | //!     most of the `x86_64` CPU model detection logic | 
|---|
| 28 | //! * specific (newer) CPU models, for certain non-standard counters | 
|---|
| 29 | //!   * e.g. `instructions-minus-irqs:u` requires a "hardware interrupts" (aka "IRQs") | 
|---|
| 30 | //!     counter, which is implemented differently between vendors / models (if at all) | 
|---|
| 31 | //! * single-threaded programs (counters only work on the thread they were created on) | 
|---|
| 32 | //!   * for profiling `rustc`, this means only "check mode" (`--emit=metadata`) | 
|---|
| 33 | //!     is supported currently (`-Z no-llvm-threads` could also work) | 
|---|
| 34 | //!   * unclear what the best approach for handling multiple threads would be | 
|---|
| 35 | //!   * changing the API (e.g. to require per-thread profiler handles) could result | 
|---|
| 36 | //!     in a more efficient implementation, but would also be less ergonomic | 
|---|
| 37 | //!   * profiling data from multithreaded programs would be harder to use due to | 
|---|
| 38 | //!     noise from synchronization mechanisms, non-deterministic work-stealing, etc. | 
|---|
| 39 | //! | 
|---|
| 40 | //! For ergonomic reasons, the public API doesn't vary based on `features` or target. | 
|---|
| 41 | //! Instead, attempting to create any unsupported counter will return `Err`, just | 
|---|
| 42 | //! like it does for any issue detected at runtime (e.g. incompatible CPU model). | 
|---|
| 43 | //! | 
|---|
| 44 | //! When counting instructions specifically, these factors will impact the profiling quality: | 
|---|
| 45 | //! * high-level non-determinism (e.g. user interactions, networking) | 
|---|
| 46 | //!   * the ideal use-case is a mostly-deterministic program, e.g. a compiler like `rustc` | 
|---|
| 47 | //!   * if I/O can be isolated to separate profiling events, and doesn't impact | 
|---|
| 48 | //!     execution in a more subtle way (see below), the deterministic parts of | 
|---|
| 49 | //!     the program can still be profiled with high accuracy | 
|---|
| 50 | //!   * intentional uses of randomness may change execution paths, though for | 
|---|
| 51 | //!     cryptographic operations specifically, "constant time" implementations | 
|---|
| 52 | //!     are preferred / necessary (in order to limit an external observer's | 
|---|
| 53 | //!     ability to infer secrets), so they're not as much of a problem | 
|---|
| 54 | //!   * even otherwise-deterministic machine-local communication (to e.g. system | 
|---|
| 55 | //!     services or drivers) can behave unpredictably (especially under load) | 
|---|
| 56 | //!     * while we haven't observed this in the wild yet, it's possible for | 
|---|
| 57 | //!       file reads/writes to be split up into multiple smaller chunks | 
|---|
| 58 | //!       (and therefore take more userspace instructions to fully read/write) | 
|---|
| 59 | //! * low-level non-determinism (e.g. ASLR, randomized `HashMap`s, timers) | 
|---|
| 60 | //!   * ASLR ("Address Space Layout Randomization"), may be provided by the OS for | 
|---|
| 61 | //!     security reasons, or accidentally caused through allocations that depend on | 
|---|
| 62 | //!     random data (even as low-entropy as e.g. the base 10 length of a process ID) | 
|---|
| 63 | //!   * on Linux ASLR can be disabled by running the process under `setarch -R` | 
|---|
| 64 | //!   * this impacts `rustc` and LLVM, which rely on keying `HashMap`s by addresses | 
|---|
| 65 | //!     (typically of interned data) as an optimization, and while non-deterministic | 
|---|
| 66 | //!     outputs are considered bugs, the instructions executed can still vary a lot, | 
|---|
| 67 | //!     even when the externally observable behavior is perfectly repeatable | 
|---|
| 68 | //!   * `HashMap`s are involved in more than one way: | 
|---|
| 69 | //!     * both the executed instructions, and the shape of the allocations depend | 
|---|
| 70 | //!       on both the hasher state and choice of keys (as the buckets are in | 
|---|
| 71 | //!       a flat array indexed by some of the lower bits of the key hashes) | 
|---|
| 72 | //!     * so every `HashMap` with keys being/containing addresses will amplify | 
|---|
| 73 | //!       ASLR and ASLR-like effects, making the entire program more sensitive | 
|---|
| 74 | //!     * the default hasher is randomized, and while `rustc` doesn't use it, | 
|---|
| 75 | //!       proc macros can (and will), and it's harder to disable than Linux ASLR | 
|---|
| 76 | //!   * most ways of measuring time will inherently never perfectly align with | 
|---|
| 77 | //!     exact points in the program's execution, making time behave like another | 
|---|
| 78 | //!     low-entropy source of randomness - this also means timers will elapse at | 
|---|
| 79 | //!     unpredictable points (which can further impact the rest of the execution) | 
|---|
| 80 | //!     * this includes the common thread scheduler technique of preempting the | 
|---|
| 81 | //!       currently executing thread with a periodic timer interrupt, so the exact | 
|---|
| 82 | //!       interleaving of multiple threads will likely not be reproducible without | 
|---|
| 83 | //!       special OS configuration, or tools that emulate a deterministic scheduler | 
|---|
| 84 | //!     * `jemalloc` (the allocator used by `rustc`, at least in official releases) | 
|---|
| 85 | //!       has a 10 second "purge timer", which can introduce an ASLR-like effect, | 
|---|
| 86 | //!       unless disabled with `MALLOC_CONF=dirty_decay_ms:0,muzzy_decay_ms:0` | 
|---|
| 87 | //! * hardware flaws (whether in the design or implementation) | 
|---|
| 88 | //!   * hardware interrupts ("IRQs") and exceptions (like page faults) cause | 
|---|
| 89 | //!     overcounting (1 instruction per interrupt, possibly the `iret` from the | 
|---|
| 90 | //!     kernel handler back to the interrupted userspace program) | 
|---|
| 91 | //!     * this is the reason why `instructions-minus-irqs:u` should be preferred | 
|---|
| 92 | //!       to `instructions:u`, where the former is available | 
|---|
| 93 | //!     * there are system-wide options (e.g. `CONFIG_NO_HZ_FULL`) for removing | 
|---|
| 94 | //!       some interrupts from the cores used for profiling, but they're not as | 
|---|
| 95 | //!       complete of a solution, nor easy to set up in the first place | 
|---|
| 96 | //!   * AMD Zen CPUs have a speculative execution feature (dubbed `SpecLockMap`), | 
|---|
| 97 | //!     which can cause non-deterministic overcounting for instructions following | 
|---|
| 98 | //!     an atomic instruction (such as found in heap allocators, or `measureme`) | 
|---|
| 99 | //!     * this is automatically detected, with a `log` message pointing the user | 
|---|
| 100 | //!       to <https://github.com/mozilla/rr/wiki/Zen> for guidance on how to | 
|---|
| 101 | //!       disable `SpecLockMap` on their system (sadly requires root access) | 
|---|
| 102 | //! | 
|---|
| 103 | //! Even if some of the above caveats apply for some profiling setup, as long as | 
|---|
| 104 | //! the counters function, they can still be used, and compared with `wall-time`. | 
|---|
| 105 | //! Chances are, they will still have less variance, as everything that impacts | 
|---|
| 106 | //! instruction counts will also impact any time measurements. | 
|---|
| 107 | //! | 
|---|
| 108 | //! Also keep in mind that instruction counts do not properly reflect all kinds | 
|---|
| 109 | //! of workloads, e.g. SIMD throughput and cache locality are unaccounted for. | 
|---|
| 110 |  | 
|---|
| 111 | // FIXME: Use a cargo feature for accurate_seqlock_rdpmc and unserialized_rdpmc | 
|---|
| 112 | //        so we don't need this: | 
|---|
| 113 | #![ allow(unexpected_cfgs)] | 
|---|
| 114 |  | 
|---|
| 115 | use std::error::Error; | 
|---|
| 116 | use std::time::Instant; | 
|---|
| 117 |  | 
|---|
// HACK(eddyb) semantically this is a `warn!`, but it is emitted through
// `error!` since that is the only log level enabled by default - see also
// https://github.com/rust-lang/rust/issues/76824
//
// The message has to be a string literal: it gets a "[WARNING] " prefix and
// every remaining token (e.g. format arguments) is forwarded to `error!` as-is.
macro_rules! really_warn {
    ($message:literal $($args:tt)*) => (
        error!(concat!("[WARNING] ", $message) $($args)*)
    );
}
|---|
| 126 |  | 
|---|
| 127 | pub enum Counter { | 
|---|
| 128 | WallTime(WallTime), | 
|---|
| 129 | Instructions(Instructions), | 
|---|
| 130 | InstructionsMinusIrqs(InstructionsMinusIrqs), | 
|---|
| 131 | InstructionsMinusRaw0420(InstructionsMinusRaw0420), | 
|---|
| 132 | } | 
|---|
| 133 |  | 
|---|
| 134 | impl Counter { | 
|---|
| 135 | pub fn by_name(name: &str) -> Result<Self, Box<dyn Error + Send + Sync>> { | 
|---|
| 136 | Ok(match name { | 
|---|
| 137 | WallTime::NAME => Counter::WallTime(WallTime::new()), | 
|---|
| 138 | Instructions::NAME => Counter::Instructions(Instructions::new()?), | 
|---|
| 139 | InstructionsMinusIrqs::NAME => { | 
|---|
| 140 | Counter::InstructionsMinusIrqs(InstructionsMinusIrqs::new()?) | 
|---|
| 141 | } | 
|---|
| 142 | InstructionsMinusRaw0420::NAME => { | 
|---|
| 143 | Counter::InstructionsMinusRaw0420(InstructionsMinusRaw0420::new()?) | 
|---|
| 144 | } | 
|---|
| 145 | _ => return Err(format!( "{:?}  is not a valid counter name", name).into()), | 
|---|
| 146 | }) | 
|---|
| 147 | } | 
|---|
| 148 |  | 
|---|
| 149 | pub(super) fn describe_as_json(&self) -> String { | 
|---|
| 150 | let (name, units) = match self { | 
|---|
| 151 | Counter::WallTime(_) => ( | 
|---|
| 152 | WallTime::NAME, | 
|---|
| 153 | r#"[["ns", 1], ["μs", 1000], ["ms", 1000000], ["s", 1000000000]]"#, | 
|---|
| 154 | ), | 
|---|
| 155 | Counter::Instructions(_) => (Instructions::NAME, r#"[["instructions", 1]]"#), | 
|---|
| 156 | Counter::InstructionsMinusIrqs(_) => { | 
|---|
| 157 | (InstructionsMinusIrqs::NAME, r#"[["instructions", 1]]"#) | 
|---|
| 158 | } | 
|---|
| 159 | Counter::InstructionsMinusRaw0420(_) => { | 
|---|
| 160 | (InstructionsMinusRaw0420::NAME, r#"[["instructions", 1]]"#) | 
|---|
| 161 | } | 
|---|
| 162 | }; | 
|---|
| 163 | format!( r#"{{  "name": "{} ", "units": {}  }} "#, name, units) | 
|---|
| 164 | } | 
|---|
| 165 |  | 
|---|
| 166 | #[ inline] | 
|---|
| 167 | pub(super) fn since_start(&self) -> u64 { | 
|---|
| 168 | match self { | 
|---|
| 169 | Counter::WallTime(counter) => counter.since_start(), | 
|---|
| 170 | Counter::Instructions(counter) => counter.since_start(), | 
|---|
| 171 | Counter::InstructionsMinusIrqs(counter) => counter.since_start(), | 
|---|
| 172 | Counter::InstructionsMinusRaw0420(counter) => counter.since_start(), | 
|---|
| 173 | } | 
|---|
| 174 | } | 
|---|
| 175 | } | 
|---|
| 176 |  | 
|---|
/// Nanosecond-precision "monotonic clock" counter, backed by [`std::time::Instant`].
///
/// Can be obtained with `Counter::by_name("wall-time")`.
pub struct WallTime {
    /// The instant at which this counter was created.
    start: Instant,
}

impl WallTime {
    const NAME: &'static str = "wall-time";

    /// Starts the clock at the moment of the call.
    pub fn new() -> Self {
        let start = Instant::now();
        Self { start }
    }

    /// Nanoseconds elapsed since this counter was created, truncated to `u64`.
    #[inline]
    fn since_start(&self) -> u64 {
        let nanos: u128 = self.start.elapsed().as_nanos();
        nanos as u64
    }
}
|---|
| 198 |  | 
|---|
| 199 | /// "Instructions retired" hardware performance counter (userspace-only). | 
|---|
| 200 | /// | 
|---|
| 201 | /// Can be obtained with `Counter::by_name("instructions:u")`. | 
|---|
| 202 | pub struct Instructions { | 
|---|
| 203 | instructions: hw::Counter, | 
|---|
| 204 | start: u64, | 
|---|
| 205 | } | 
|---|
| 206 |  | 
|---|
| 207 | impl Instructions { | 
|---|
| 208 | const NAME: &'static str = "instructions:u"; | 
|---|
| 209 |  | 
|---|
| 210 | pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> { | 
|---|
| 211 | let model: CpuModel = hw::CpuModel::detect()?; | 
|---|
| 212 | let instructions: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Instructions)?; | 
|---|
| 213 | let start: u64 = instructions.read(); | 
|---|
| 214 | Ok(Instructions { | 
|---|
| 215 | instructions, | 
|---|
| 216 | start, | 
|---|
| 217 | }) | 
|---|
| 218 | } | 
|---|
| 219 |  | 
|---|
| 220 | #[ inline] | 
|---|
| 221 | fn since_start(&self) -> u64 { | 
|---|
| 222 | self.instructions.read().wrapping_sub(self.start) | 
|---|
| 223 | } | 
|---|
| 224 | } | 
|---|
| 225 |  | 
|---|
| 226 | /// More accurate [`Instructions`] (subtracting hardware interrupt counts). | 
|---|
| 227 | /// | 
|---|
| 228 | /// Can be obtained with `Counter::by_name("instructions-minus-irqs:u")`. | 
|---|
| 229 | pub struct InstructionsMinusIrqs { | 
|---|
| 230 | instructions: hw::Counter, | 
|---|
| 231 | irqs: hw::Counter, | 
|---|
| 232 | start: u64, | 
|---|
| 233 | } | 
|---|
| 234 |  | 
|---|
| 235 | impl InstructionsMinusIrqs { | 
|---|
| 236 | const NAME: &'static str = "instructions-minus-irqs:u"; | 
|---|
| 237 |  | 
|---|
| 238 | pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> { | 
|---|
| 239 | let model: CpuModel = hw::CpuModel::detect()?; | 
|---|
| 240 | let instructions: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Instructions)?; | 
|---|
| 241 | let irqs: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Irqs)?; | 
|---|
| 242 | let (start_instructions: u64, start_irqs: u64) = (&instructions, &irqs).read(); | 
|---|
| 243 | let start: u64 = start_instructions.wrapping_sub(start_irqs); | 
|---|
| 244 | Ok(InstructionsMinusIrqs { | 
|---|
| 245 | instructions, | 
|---|
| 246 | irqs, | 
|---|
| 247 | start, | 
|---|
| 248 | }) | 
|---|
| 249 | } | 
|---|
| 250 |  | 
|---|
| 251 | #[ inline] | 
|---|
| 252 | fn since_start(&self) -> u64 { | 
|---|
| 253 | let (instructions: u64, irqs: u64) = (&self.instructions, &self.irqs).read(); | 
|---|
| 254 | instructions.wrapping_sub(irqs).wrapping_sub(self.start) | 
|---|
| 255 | } | 
|---|
| 256 | } | 
|---|
| 257 |  | 
|---|
| 258 | /// (Experimental) Like [`InstructionsMinusIrqs`] (but using an undocumented `r0420:u` counter). | 
|---|
| 259 | /// | 
|---|
| 260 | /// Can be obtained with `Counter::by_name("instructions-minus-r0420:u")`. | 
|---|
| 261 | // | 
|---|
| 262 | // HACK(eddyb) this is a variant of `instructions-minus-irqs:u`, where `r0420` | 
|---|
| 263 | // is subtracted, instead of the usual "hardware interrupts" (aka IRQs). | 
|---|
| 264 | // `r0420` is an undocumented counter on AMD Zen CPUs which appears to count | 
|---|
| 265 | // both hardware interrupts and exceptions (such as page faults), though | 
|---|
| 266 | // it's unclear yet what exactly it's counting (could even be `iret`s). | 
|---|
| 267 | pub struct InstructionsMinusRaw0420(InstructionsMinusIrqs); | 
|---|
| 268 |  | 
|---|
| 269 | impl InstructionsMinusRaw0420 { | 
|---|
| 270 | const NAME: &'static str = "instructions-minus-r0420:u"; | 
|---|
| 271 |  | 
|---|
| 272 | pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> { | 
|---|
| 273 | let model: CpuModel = hw::CpuModel::detect()?; | 
|---|
| 274 | let instructions: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Instructions)?; | 
|---|
| 275 | let irqs: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Raw0420)?; | 
|---|
| 276 | let (start_instructions: u64, start_irqs: u64) = (&instructions, &irqs).read(); | 
|---|
| 277 | let start: u64 = start_instructions.wrapping_sub(start_irqs); | 
|---|
| 278 | Ok(InstructionsMinusRaw0420(InstructionsMinusIrqs { | 
|---|
| 279 | instructions, | 
|---|
| 280 | irqs, | 
|---|
| 281 | start, | 
|---|
| 282 | })) | 
|---|
| 283 | } | 
|---|
| 284 |  | 
|---|
| 285 | #[ inline] | 
|---|
| 286 | fn since_start(&self) -> u64 { | 
|---|
| 287 | self.0.since_start() | 
|---|
| 288 | } | 
|---|
| 289 | } | 
|---|
| 290 |  | 
|---|
/// Reading of one or more hardware counters.
///
/// Implemented both for a single counter and for a pair of counters, which
/// is why the result type is an associated type rather than a fixed `u64`.
trait HwCounterRead {
    /// What a read produces (e.g. `u64`, or `(u64, u64)` for a pair).
    type Output;

    /// Reads the current value(s).
    fn read(&self) -> Self::Output;
}
|---|
| 295 |  | 
|---|
/// Which hardware event a `hw::Counter` should be created for.
enum HwCounterType {
    /// Retired instructions.
    Instructions,
    /// Hardware interrupts ("IRQs").
    Irqs,
    /// The undocumented `r0420` raw event.
    Raw0420,
}
|---|
| 301 |  | 
|---|
/// Hint pointing users at the issue tracker, meant to be embedded in
/// messages about unexpected conditions.
const BUG_REPORT_MSG: &str =
    "please report this to https://github.com/rust-lang/measureme/issues/new";
|---|
| 304 |  | 
|---|
| 305 | /// Linux x86_64 implementation based on `perf_event_open` and `rdpmc`. | 
|---|
| 306 | #[ cfg(all(target_arch = "x86_64", target_os = "linux"))] | 
|---|
| 307 | mod hw { | 
|---|
| 308 | use memmap2::{Mmap, MmapOptions}; | 
|---|
| 309 | use perf_event_open_sys::{bindings::*, perf_event_open}; | 
|---|
| 310 | use std::arch::asm; | 
|---|
| 311 | use std::convert::TryInto; | 
|---|
| 312 | use std::error::Error; | 
|---|
| 313 | use std::fs; | 
|---|
| 314 | use std::mem; | 
|---|
| 315 | use std::os::unix::io::FromRawFd; | 
|---|
| 316 |  | 
|---|
| 317 | pub(super) struct Counter { | 
|---|
| 318 | mmap: Mmap, | 
|---|
| 319 | reg_idx: u32, | 
|---|
| 320 | } | 
|---|
| 321 |  | 
|---|
| 322 | impl Counter { | 
|---|
| 323 | pub(super) fn new( | 
|---|
| 324 | model: &CpuModel, | 
|---|
| 325 | counter_type: super::HwCounterType, | 
|---|
| 326 | ) -> Result<Self, Box<dyn Error + Send + Sync>> { | 
|---|
| 327 | let (type_, hw_id) = match counter_type { | 
|---|
| 328 | super::HwCounterType::Instructions => { | 
|---|
| 329 | (PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS) | 
|---|
| 330 | } | 
|---|
| 331 | super::HwCounterType::Irqs => (PERF_TYPE_RAW, model.irqs_counter_config()?), | 
|---|
| 332 | super::HwCounterType::Raw0420 => { | 
|---|
| 333 | match model { | 
|---|
| 334 | CpuModel::Amd(AmdGen::Zen) => {} | 
|---|
| 335 |  | 
|---|
| 336 | _ => really_warn!( | 
|---|
| 337 | "Counter::new: the undocumented `r0420` performance \ | 
|---|
| 338 |                              counter has only been observed on AMD Zen CPUs" | 
|---|
| 339 | ), | 
|---|
| 340 | } | 
|---|
| 341 |  | 
|---|
| 342 | (PERF_TYPE_RAW, 0x04_20) | 
|---|
| 343 | } | 
|---|
| 344 | }; | 
|---|
| 345 | Self::with_type_and_hw_id(type_, hw_id) | 
|---|
| 346 | } | 
|---|
| 347 |  | 
|---|
| 348 | fn with_type_and_hw_id( | 
|---|
| 349 | type_: perf_type_id, | 
|---|
| 350 | hw_id: u32, | 
|---|
| 351 | ) -> Result<Self, Box<dyn Error + Send + Sync>> { | 
|---|
| 352 | let mut attrs = perf_event_attr { | 
|---|
| 353 | size: mem::size_of::<perf_event_attr>().try_into().unwrap(), | 
|---|
| 354 | type_, | 
|---|
| 355 | config: hw_id.into(), | 
|---|
| 356 | ..perf_event_attr::default() | 
|---|
| 357 | }; | 
|---|
| 358 |  | 
|---|
| 359 | // Only record same-thread, any CPUs, and only userspace (no kernel/hypervisor). | 
|---|
| 360 | // NOTE(eddyb) `pid = 0`, despite talking about "process id", means | 
|---|
| 361 | // "calling process/thread", *not* "any thread in the calling process" | 
|---|
| 362 | // (i.e. "process" is interchangeable with "main thread of the process") | 
|---|
| 363 | // FIXME(eddyb) introduce per-thread counters and/or use `inherit` | 
|---|
| 364 | // (and `inherit_stat`? though they might not be appropriate here) | 
|---|
| 365 | // to be able to read the counter on more than just the initial thread. | 
|---|
| 366 | let pid = 0; | 
|---|
| 367 | let cpu = -1; | 
|---|
| 368 | let group_fd = -1; | 
|---|
| 369 | attrs.set_exclude_kernel(1); | 
|---|
| 370 | attrs.set_exclude_hv(1); | 
|---|
| 371 |  | 
|---|
| 372 | let file = unsafe { | 
|---|
| 373 | let fd = | 
|---|
| 374 | perf_event_open(&mut attrs, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC.into()); | 
|---|
| 375 | if fd < 0 { | 
|---|
| 376 | Err(std::io::Error::from_raw_os_error(-fd)) | 
|---|
| 377 | } else { | 
|---|
| 378 | Ok(fs::File::from_raw_fd(fd)) | 
|---|
| 379 | } | 
|---|
| 380 | }; | 
|---|
| 381 | let file = file.map_err(|e| format!( "perf_event_open failed: {:?} ", e))?; | 
|---|
| 382 |  | 
|---|
| 383 | let mmap = unsafe { | 
|---|
| 384 | MmapOptions::new() | 
|---|
| 385 | .len(mem::size_of::<perf_event_mmap_page>()) | 
|---|
| 386 | .map(&file) | 
|---|
| 387 | }; | 
|---|
| 388 | let mmap = mmap.map_err(|e| format!( "perf_event_mmap_page: mmap failed: {:?} ", e))?; | 
|---|
| 389 |  | 
|---|
| 390 | let mut counter = Counter { mmap, reg_idx: 0 }; | 
|---|
| 391 |  | 
|---|
| 392 | let (version, compat_version, caps, index, pmc_width) = counter | 
|---|
| 393 | .access_mmap_page_with_seqlock(|mp| { | 
|---|
| 394 | ( | 
|---|
| 395 | mp.version, | 
|---|
| 396 | mp.compat_version, | 
|---|
| 397 | unsafe { mp.__bindgen_anon_1.__bindgen_anon_1 }, | 
|---|
| 398 | mp.index, | 
|---|
| 399 | mp.pmc_width, | 
|---|
| 400 | ) | 
|---|
| 401 | }); | 
|---|
| 402 |  | 
|---|
| 403 | info!( | 
|---|
| 404 | "Counter::new: version={}  compat_version={}  index={:#x} ", | 
|---|
| 405 | version, compat_version, index, | 
|---|
| 406 | ); | 
|---|
| 407 |  | 
|---|
| 408 | if caps.cap_user_rdpmc() == 0 { | 
|---|
| 409 | return Err(format!( | 
|---|
| 410 | "perf_event_mmap_page: missing cap_user_rdpmc{} ", | 
|---|
| 411 | if caps.cap_bit0_is_deprecated() == 0 && caps.cap_bit0() == 1 { | 
|---|
| 412 | " (ignoring legacy/broken rdpmc support)" | 
|---|
| 413 | } else { | 
|---|
| 414 | "" | 
|---|
| 415 | } | 
|---|
| 416 | ) | 
|---|
| 417 | .into()); | 
|---|
| 418 | } | 
|---|
| 419 |  | 
|---|
| 420 | if index == 0 { | 
|---|
| 421 | return Err(format!( | 
|---|
| 422 | "perf_event_mmap_page: no allocated hardware register (ran out?)" | 
|---|
| 423 | ) | 
|---|
| 424 | .into()); | 
|---|
| 425 | } | 
|---|
| 426 | counter.reg_idx = index - 1; | 
|---|
| 427 |  | 
|---|
| 428 | if (cfg!(not(accurate_seqlock_rdpmc)) || true) && pmc_width != 48 { | 
|---|
| 429 | return Err(format!( | 
|---|
| 430 | "perf_event_mmap_page: {} -bit hardware counter found, only 48-bit supported", | 
|---|
| 431 | pmc_width | 
|---|
| 432 | ) | 
|---|
| 433 | .into()); | 
|---|
| 434 | } | 
|---|
| 435 |  | 
|---|
| 436 | Ok(counter) | 
|---|
| 437 | } | 
|---|
| 438 |  | 
|---|
| 439 | /// Try to access the mmap page, retrying the `attempt` closure as long | 
|---|
| 440 | /// as the "seqlock" sequence number changes (which indicates the kernel | 
|---|
| 441 | /// has updated one or more fields within the mmap page). | 
|---|
| 442 | #[ inline] | 
|---|
| 443 | fn access_mmap_page_with_seqlock<T>( | 
|---|
| 444 | &self, | 
|---|
| 445 | attempt: impl Fn(&perf_event_mmap_page) -> T, | 
|---|
| 446 | ) -> T { | 
|---|
| 447 | // FIXME(eddyb) it's probably UB to use regular reads, especially | 
|---|
| 448 | // from behind `&T`, with the only synchronization being barriers. | 
|---|
| 449 | // Probably needs atomic reads, and stronger ones at that, for the | 
|---|
| 450 | // `lock` field, than the fields (which would be `Relaxed`?). | 
|---|
| 451 | let mmap_page = unsafe { &*(self.mmap.as_ptr() as *const perf_event_mmap_page) }; | 
|---|
| 452 | let barrier = || std::sync::atomic::fence(std::sync::atomic::Ordering::Acquire); | 
|---|
| 453 |  | 
|---|
| 454 | loop { | 
|---|
| 455 | // Grab the "seqlock" - the kernel will update this value when it | 
|---|
| 456 | // updates any of the other fields that may be read in `attempt`. | 
|---|
| 457 | let seq_lock = mmap_page.lock; | 
|---|
| 458 | barrier(); | 
|---|
| 459 |  | 
|---|
| 460 | let result = attempt(mmap_page); | 
|---|
| 461 |  | 
|---|
| 462 | // If nothing has changed, we're done. Otherwise, keep retrying. | 
|---|
| 463 | barrier(); | 
|---|
| 464 | if mmap_page.lock == seq_lock { | 
|---|
| 465 | return result; | 
|---|
| 466 | } | 
|---|
| 467 | } | 
|---|
| 468 | } | 
|---|
| 469 | } | 
|---|
| 470 |  | 
|---|
| 471 | impl super::HwCounterRead for Counter { | 
|---|
| 472 | type Output = u64; | 
|---|
| 473 |  | 
|---|
| 474 | #[ inline] | 
|---|
| 475 | fn read(&self) -> u64 { | 
|---|
| 476 | // HACK(eddyb) keep the accurate code around while not using it, | 
|---|
| 477 | // to minimize overhead without losing the more complex implementation. | 
|---|
| 478 | let (counter, offset, pmc_width) = if cfg!(accurate_seqlock_rdpmc) && false { | 
|---|
| 479 | self.access_mmap_page_with_seqlock(|mp| { | 
|---|
| 480 | let caps = unsafe { mp.__bindgen_anon_1.__bindgen_anon_1 }; | 
|---|
| 481 | assert_ne!(caps.cap_user_rdpmc(), 0); | 
|---|
| 482 |  | 
|---|
| 483 | ( | 
|---|
| 484 | rdpmc(mp.index.checked_sub(1).unwrap()), | 
|---|
| 485 | mp.offset, | 
|---|
| 486 | mp.pmc_width, | 
|---|
| 487 | ) | 
|---|
| 488 | }) | 
|---|
| 489 | } else { | 
|---|
| 490 | (rdpmc(self.reg_idx), 0, 48) | 
|---|
| 491 | }; | 
|---|
| 492 |  | 
|---|
| 493 | let counter = offset + (counter as i64); | 
|---|
| 494 |  | 
|---|
| 495 | // Sign-extend the `pmc_width`-bit value to `i64`. | 
|---|
| 496 | (counter << (64 - pmc_width) >> (64 - pmc_width)) as u64 | 
|---|
| 497 | } | 
|---|
| 498 | } | 
|---|
| 499 |  | 
|---|
| 500 | impl super::HwCounterRead for (&Counter, &Counter) { | 
|---|
| 501 | type Output = (u64, u64); | 
|---|
| 502 |  | 
|---|
| 503 | #[ inline] | 
|---|
| 504 | fn read(&self) -> (u64, u64) { | 
|---|
| 505 | // HACK(eddyb) keep the accurate code around while not using it, | 
|---|
| 506 | // to minimize overhead without losing the more complex implementation. | 
|---|
| 507 | if (cfg!(accurate_seqlock_rdpmc) || cfg!(unserialized_rdpmc)) && false { | 
|---|
| 508 | return (self.0.read(), self.1.read()); | 
|---|
| 509 | } | 
|---|
| 510 |  | 
|---|
| 511 | let pmc_width = 48; | 
|---|
| 512 |  | 
|---|
| 513 | let (a_counter, b_counter) = rdpmc_pair(self.0.reg_idx, self.1.reg_idx); | 
|---|
| 514 |  | 
|---|
| 515 | // Sign-extend the `pmc_width`-bit values to `i64`. | 
|---|
| 516 | ( | 
|---|
| 517 | ((a_counter as i64) << (64 - pmc_width) >> (64 - pmc_width)) as u64, | 
|---|
| 518 | ((b_counter as i64) << (64 - pmc_width) >> (64 - pmc_width)) as u64, | 
|---|
| 519 | ) | 
|---|
| 520 | } | 
|---|
| 521 | } | 
|---|
| 522 |  | 
|---|
| 523 | /// Read the hardware performance counter indicated by `reg_idx`. | 
|---|
| 524 | /// | 
|---|
| 525 | /// If the counter is signed, sign extension should be performed based on | 
|---|
| 526 | /// the width of the register (32 to 64 bits, e.g. 48-bit seems common). | 
|---|
| 527 | #[ inline(always)] | 
|---|
| 528 | fn rdpmc(reg_idx: u32) -> u64 { | 
|---|
| 529 | // NOTE(eddyb) below comment is outdated (the other branch uses `cpuid`). | 
|---|
| 530 | if cfg!(unserialized_rdpmc) && false { | 
|---|
| 531 | // FIXME(eddyb) the Intel and AMD manuals warn about the need for | 
|---|
| 532 | // "serializing instructions" before/after `rdpmc`, if avoiding any | 
|---|
| 533 | // reordering is desired, but do not agree on the full set of usable | 
|---|
| 534 | // "serializing instructions" (e.g. `mfence` isn't listed in both). | 
|---|
| 535 | // | 
|---|
| 536 | // The only usable, and guaranteed to work, "serializing instruction" | 
|---|
| 537 | // appears to be `cpuid`, but it doesn't seem easy to use, especially | 
|---|
| 538 | // due to the overlap in registers with `rdpmc` itself, and it might | 
|---|
| 539 | // have too high of a cost, compared to serialization benefits (if any). | 
|---|
| 540 | unserialized_rdpmc(reg_idx) | 
|---|
| 541 | } else { | 
|---|
| 542 | serialize_instruction_execution(); | 
|---|
| 543 | unserialized_rdpmc(reg_idx) | 
|---|
| 544 | } | 
|---|
| 545 | } | 
|---|
| 546 |  | 
|---|
| 547 | /// Read two hardware performance counters at once (see `rdpmc`). | 
|---|
| 548 | /// | 
|---|
| 549 | /// Should be more efficient/accurate than two `rdpmc` calls, as it | 
|---|
| 550 | /// only requires one "serializing instruction", rather than two. | 
|---|
| 551 | #[ inline(always)] | 
|---|
| 552 | fn rdpmc_pair(a_reg_idx: u32, b_reg_idx: u32) -> (u64, u64) { | 
|---|
| 553 | serialize_instruction_execution(); | 
|---|
| 554 | (unserialized_rdpmc(a_reg_idx), unserialized_rdpmc(b_reg_idx)) | 
|---|
| 555 | } | 
|---|
| 556 |  | 
|---|
/// Dummy `cpuid(0)` to serialize instruction execution.
#[inline(always)]
fn serialize_instruction_execution() {
    // SAFETY: the block only writes `eax`/`ecx`/`edx` (declared as clobbers)
    // and `rbx` (saved to / restored from a scratch register around `cpuid`),
    // and it doesn't touch the stack (`nostack`).
    unsafe {
        asm!(
            "xor %eax, %eax", // Intel syntax: "xor eax, eax"
            // LLVM sometimes reserves `ebx` for its internal use, so we need to use
            // a scratch register for it instead.
            "mov %rbx, {tmp_rbx:r}", // Intel syntax: "mov {tmp_rbx:r}, rbx"
            "cpuid",
            "mov {tmp_rbx:r}, %rbx", // Intel syntax: "mov rbx, {tmp_rbx:r}"
            tmp_rbx = lateout(reg) _,
            // `cpuid` clobbers.
            lateout("eax") _,
            lateout("edx") _,
            lateout("ecx") _,

            options(nostack),
            // Older versions of LLVM do not support modifiers in
            // Intel syntax inline asm; whenever Rust minimum LLVM version
            // supports Intel syntax inline asm, remove and replace above
            // instructions with Intel syntax version (from comments).
            options(att_syntax),
        );
    }
}
|---|
| 583 |  | 
|---|
/// Executes a bare `rdpmc` for the counter selected by `reg_idx`, without
/// any serialization, and returns the combined 64-bit result.
///
/// If the counter is signed, sign extension should be performed based on
/// the width of the register (32 to 64 bits, e.g. 48-bit seems common).
#[inline(always)]
fn unserialized_rdpmc(reg_idx: u32) -> u64 {
    let low_half: u32;
    let high_half: u32;
    // NOTE(review): presumably relies on userspace `rdpmc` being permitted for
    // `reg_idx` (cf. the `cap_user_rdpmc` check made when creating a counter).
    unsafe {
        asm!(
            "rdpmc",
            in("ecx") reg_idx,
            lateout("eax") low_half,
            lateout("edx") high_half,
            options(nostack),
            // Older versions of LLVM do not support modifiers in
            // Intel syntax inline asm; whenever Rust minimum LLVM version
            // supports Intel syntax inline asm, this option can be dropped.
            options(att_syntax),
        );
    }
    // `edx` holds the upper 32 bits of the result, `eax` the lower 32 bits.
    (u64::from(high_half) << 32) | u64::from(low_half)
}
|---|
| 607 |  | 
|---|
/// Categorization of `x86_64` CPUs, primarily based on how they
/// support counting "hardware interrupts" (documented or not).
pub(super) enum CpuModel {
    /// CPU whose `cpuid` vendor string is `AuthenticAMD`, with its generation.
    Amd(AmdGen),
    /// CPU whose `cpuid` vendor string is `GenuineIntel`, with its generation.
    Intel(IntelGen),
}
|---|
| 614 |  | 
|---|
/// Generations of AMD CPUs, as far as they matter for choosing the
/// "hardware interrupts" counter config: `PreZen` uses one config, while
/// `Zen` and `UnknownMaybeZenLike` share another.
pub(super) enum AmdGen {
    /// K8 (Hammer) to Jaguar / Puma.
    PreZen,

    /// Zen / Zen+ / Zen 2.
    Zen,

    /// Unknown AMD CPU, contemporary to/succeeding Zen/Zen+/Zen 2,
    /// but likely similar to them.
    UnknownMaybeZenLike,
}
|---|
| 626 |  | 
|---|
/// Generations of Intel CPUs, as far as they matter for counting
/// "hardware interrupts": `PreBridge` has no known counter config, while
/// every other variant shares the same one.
pub(super) enum IntelGen {
    /// Intel CPU predating Sandy Bridge. These are the only CPUs we
    /// can't support (more) accurate instruction counting on, as they
    /// don't (appear to) have any way to count "hardware interrupts".
    PreBridge,

    /// Sandy Bridge / Ivy Bridge:
    /// * client: Sandy Bridge (M/H) / Ivy Bridge (M/H/Gladden)
    /// * server: Sandy Bridge (E/EN/EP) / Ivy Bridge (E/EN/EP/EX)
    ///
    /// Intel doesn't document support for counting "hardware interrupts"
    /// prior to Skylake, but testing found that `HW_INTERRUPTS.RECEIVED`
    /// from Skylake has existed, with the same config, as far back as
    /// "Sandy Bridge" (but before that it mapped to a different event).
    ///
    /// These are the (pre-Skylake) *Bridge CPU models confirmed so far:
    /// * Sandy Bridge (client) Family 6 Model 42
    ///     Intel(R) Core(TM) i5-2520M CPU @ 2.50GHz (@alyssais)
    /// * Ivy Bridge (client) Family 6 Model 58
    ///     Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz (@eddyb)
    ///
    /// We later found this paper, which on page 5 lists 12 counters,
    /// for each of Nehalem/Westmere, Sandy Bridge and Ivy Bridge:
    /// <http://web.eece.maine.edu/~vweaver/projects/deterministic/deterministic_counters.pdf>
    /// It appears that both Sandy Bridge and Ivy Bridge used to have
    /// `HW_INTERRUPTS.RECEIVED` documented, before Intel removed every
    /// mention of the counter from newer versions of their manuals.
    Bridge,

    /// Haswell / Broadwell:
    /// * client: Haswell (S/ULT/GT3e) / Broadwell (U/Y/S/H/C/W)
    /// * server: Haswell (E/EP/EX) / Broadwell (E/EP/EX/DE/Hewitt Lake)
    ///
    /// Equally as undocumented as "Sandy Bridge / Ivy Bridge" (see above).
    ///
    /// These are the (pre-Skylake) *Well CPU models confirmed so far:
    /// * Haswell (client) Family 6 Model 60
    ///     Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz (@m-ou-se)
    /// * Haswell (server) Family 6 Model 63
    ///     Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz (@cuviper)
    /// * Haswell (client + GT3e) Family 6 Model 70
    ///     Intel(R) Core(TM) i7-4750HQ CPU @ 2.00GHz (@nagisa)
    ///     Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz (@m-ou-se)
    Well,

    /// Skylake / Skylake-derived:
    /// * client: Skylake (Y/U/DT/H/S) / Kaby Lake (Y/U/DT/H/S/X) / Coffee Lake (U/S/H/E)
    /// * server: Skylake (SP/X/DE/W) / Cascade Lake (SP/X/W)
    ///
    /// Both "client" and "server" product lines have documented support
    /// for counting "hardware interrupts" (`HW_INTERRUPTS.RECEIVED`).
    ///
    /// Intel does not make it clear that future product lines, such as
    /// "Ice Lake", will continue to support this (or with what config),
    /// and even "Comet Lake" (aka "10th gen") isn't explicitly listed.
    Lake,

    /// Unknown Intel CPU, contemporary to/succeeding *Bridge/*Well/*Lake,
    /// but likely similar to them.
    UnknownMaybeLakeLike,
}
|---|
| 688 |  | 
|---|
| 689 | impl CpuModel { | 
|---|
/// Detect the model of the current CPU using `cpuid`.
///
/// The vendor string is read from `cpuid(0)` and the family/model numbers
/// from `cpuid(1)`; the result is then looked up in per-vendor tables.
/// Known CPUs are logged with `info!`, while unknown AMD/Intel CPUs are
/// reported with `really_warn!` and assumed to be Zen-like/Skylake-like.
///
/// On Zen(-like) AMD CPUs this additionally probes whether `SpecLockMap`
/// appears to be enabled and warns if so; the probe is skipped when the
/// `r0825` counter cannot be opened.
///
/// # Errors
///
/// Returns an error if:
/// * the vendor string returned by `cpuid(0)` is not valid UTF-8
/// * an AMD CPU reports a family in `0..=14`, or family `19`
/// * the vendor is neither `AuthenticAMD` nor `GenuineIntel`
pub(super) fn detect() -> Result<Self, Box<dyn Error + Send + Sync>> {
    // SAFETY: `cpuid` is part of the baseline `x86_64` instruction set,
    // and leaves 0 and 1 are basic leaves.
    let cpuid0 = unsafe { std::arch::x86_64::__cpuid(0) };
    let cpuid1 = unsafe { std::arch::x86_64::__cpuid(1) };
    // The 12-byte vendor string is spread over `ebx`, `edx`, `ecx` (in that order).
    let mut vendor = [0; 12];
    vendor[0..4].copy_from_slice(&cpuid0.ebx.to_le_bytes());
    vendor[4..8].copy_from_slice(&cpuid0.edx.to_le_bytes());
    vendor[8..12].copy_from_slice(&cpuid0.ecx.to_le_bytes());

    let vendor = std::str::from_utf8(&vendor).map_err(|_| {
        format!(
            "cpuid returned non-UTF-8 vendor name: cpuid(0)={:?}  cpuid(1)={:?} ",
            cpuid0, cpuid1
        )
    })?;

    let version = cpuid1.eax;

    // Base family is bits 8..12; the extended family (bits 20..28) is only
    // added when the base family is saturated at 15.
    let mut family = (version >> 8) & 0xf;
    if family == 15 {
        // Extended family.
        family += (version >> 20) & 0xff;
    }

    // Base model is bits 4..8; the extended model (bits 16..20) supplies the
    // upper 4 bits for families >= 15, and for Intel Family 6.
    let mut model = (version >> 4) & 0xf;
    if family >= 15 || vendor == "GenuineIntel" && family == 6 {
        // Extended model.
        model += ((version >> 16) & 0xf) << 4;
    }

    info!(
        "CpuModel::detect: vendor={:?}  family={}  model={} ",
        vendor, family, model
    );

    match vendor {
        "AuthenticAMD" => {
            use self::AmdGen::*;

            // `name` is left empty for unknown CPUs (which are warned about instead).
            let (gen, name) = match (family, model) {
                (0..=14, _) | (19, _) => {
                    return Err(format!(
                        "impossible AMD64 CPU detected (Family {}  Model {} ); {} ",
                        family,
                        model,
                        super::BUG_REPORT_MSG
                    )
                    .into());
                }

                (15, _) => (PreZen, "K8 (Hammer)"),
                (16, _) => (PreZen, "K10 (Barcelona/Shanghai/Istanbul)"),
                (17, _) => (PreZen, "K8+K10 hybrid (Turion X2 Ultra)"),
                (18, _) => (PreZen, "Fusion"),
                (20, _) => (PreZen, "Bobcat"),
                (21, _) => (PreZen, "Bulldozer / Piledriver / Steamroller / Excavator"),
                (22, _) => (PreZen, "Jaguar / Puma"),

                (23, 1) => (Zen, "Zen (Naples/Whitehaven/Summit Ridge/Snowy Owl)"),
                (23, 17) => (Zen, "Zen (Raven Ridge)"),
                (23, 24) => (Zen, "Zen (Banded Kestrel/Dali) / Zen+ (Picasso)"),
                (23, 8) => (Zen, "Zen+ (Pinnacle Ridge)"),
                (23, 49) => (Zen, "Zen 2 (Rome/Castle Peak)"),
                (23, 113) => (Zen, "Zen 2 (Matisse)"),

                // Any other Family 23 model, and every later family.
                (23..=0xffff_ffff, _) => {
                    really_warn!(
                        "CpuModel::detect: unknown AMD CPU (Family {} Model {}), \
                         assuming Zen-like; {}",
                        family,
                        model,
                        super::BUG_REPORT_MSG
                    );

                    (UnknownMaybeZenLike, "")
                }
            };

            if !name.is_empty() {
                info!("CpuModel::detect: known AMD CPU: {} ", name);
            }

            // The `SpecLockMap` (speculative atomic aka `lock` instruction
            // execution, unclear what "Map" refers to) feature in AMD Zen CPUs
            // causes non-deterministic overcounting of atomic instructions,
            // presumably whenever it has to roll back the speculation
            // (as in, the performance counters aren't rolled back).
            // Even though this may be rare when uncontended, it adds up.
            //
            // There is an MSR bit (`MSRC001_1020[54]`) that's not officially
            // documented, but which several motherboards and profiling tools
            // set whenever IBS (Instruction-Based Sampling) is in use, and
            // it is sometimes referred to as "disabling `SpecLockMap`"
            // (hence having a name for the feature that speculates `lock`s).
            //
            // One way we could detect that the bit has been set would be to
            // parse `uname().release` (aka `uname -r`) and look for versions
            // which are known to include the patch suggested in this thread:
            // https://github.com/mozilla/rr/issues/2034#issuecomment-693761247
            //
            // However, one may set the bit using e.g. `wrmsr`, even on older
            // kernels, so a more reliable approach is to execute some atomics
            // and look at the `SpecLockMapCommit` (`r0825:u`) Zen counter,
            // which only reliably remains `0` when `SpecLockMap` is disabled.
            if matches!(gen, Zen | UnknownMaybeZenLike) {
                // Failing to open the counter silently skips the probe.
                if let Ok(spec_lock_map_commit) =
                    Counter::with_type_and_hw_id(PERF_TYPE_RAW, 0x08_25)
                {
                    use super::HwCounterRead;

                    let start_spec_lock_map_commit = spec_lock_map_commit.read();

                    // Execute an atomic (`lock`) instruction, which should
                    // start speculative execution for following instructions
                    // (as long as `SpecLockMap` isn't disabled).
                    let mut atomic: u64 = 0;
                    let mut _tmp: u64 = 0;
                    // SAFETY: the only memory accessed is the local `atomic`,
                    // which is passed by pointer and stays alive (and is not
                    // otherwise used) for the duration of the asm block.
                    unsafe {
                        asm!(
                            // Intel syntax: "lock xadd [{atomic}], {tmp}"
                            "lock xadd {tmp }, ({atomic })",

                            atomic = in(reg) &mut atomic,
                            tmp = inout(reg) _tmp,

                            // Older versions of LLVM do not support modifiers in
                            // Intel syntax inline asm; whenever Rust minimum LLVM
                            // version supports Intel syntax inline asm, remove
                            // and replace above instructions with Intel syntax
                            // version (from comments).
                            options(att_syntax),
                        );
                    }

                    // Any change in the counter means `SpecLockMap` is active.
                    if spec_lock_map_commit.read() != start_spec_lock_map_commit {
                        really_warn!(
                            "CpuModel::detect: SpecLockMap detected, in AMD {} CPU; \
                             this may add some non-deterministic noise - \
                             for information on disabling SpecLockMap, see \
                             https://github.com/mozilla/rr/wiki/Zen",
                            name
                        );
                    }
                }
            }

            Ok(CpuModel::Amd(gen))
        }

        "GenuineIntel" => {
            use self::IntelGen::*;

            // `name` is left empty for unnamed/unknown CPUs.
            let (gen, name) = match (family, model) {
                // No need to name these, they're unsupported anyway.
                (0..=5, _) => (PreBridge, ""),
                (15, _) => (PreBridge, "Netburst"),
                (6, 0..=41) => (PreBridge, ""),

                // Older Xeon Phi CPUs, misplaced in Family 6.
                (6, 87) => (PreBridge, "Knights Landing"),
                (6, 133) => (PreBridge, "Knights Mill"),

                // Older Atom CPUs, interleaved with other CPUs.
                // FIXME(eddyb) figure out if these are like *Bridge/*Well.
                (6, 53) | (6, 54) => (PreBridge, "Saltwell"),
                (6, 55) | (6, 74) | (6, 77) | (6, 90) | (6, 93) => {
                    (PreBridge, "Silvermont")
                }
                (6, 76) => (PreBridge, "Airmont (Cherry Trail/Braswell)"),

                // Older server CPUs, numbered out of order.
                (6, 44) => (PreBridge, "Westmere (Gulftown/EP)"),
                (6, 46) => (PreBridge, "Nehalem (EX)"),
                (6, 47) => (PreBridge, "Westmere (EX)"),

                (6, 42) => (Bridge, "Sandy Bridge (M/H)"),
                (6, 45) => (Bridge, "Sandy Bridge (E/EN/EP)"),
                (6, 58) => (Bridge, "Ivy Bridge (M/H/Gladden)"),
                (6, 62) => (Bridge, "Ivy Bridge (E/EN/EP/EX)"),

                (6, 60) => (Well, "Haswell (S)"),
                (6, 61) => (Well, "Broadwell (U/Y/S)"),
                (6, 63) => (Well, "Haswell (E/EP/EX)"),
                (6, 69) => (Well, "Haswell (ULT)"),
                (6, 70) => (Well, "Haswell (GT3e)"),
                (6, 71) => (Well, "Broadwell (H/C/W)"),
                (6, 79) => (Well, "Broadwell (E/EP/EX)"),
                (6, 86) => (Well, "Broadwell (DE/Hewitt Lake)"),

                (6, 78) => (Lake, "Skylake (Y/U)"),
                (6, 85) => (Lake, "Skylake (SP/X/DE/W) / Cascade Lake (SP/X/W)"),
                (6, 94) => (Lake, "Skylake (DT/H/S)"),
                (6, 142) => (Lake, "Kaby Lake (Y/U) / Coffee Lake (U)"),
                (6, 158) => (Lake, "Kaby Lake (DT/H/S/X) / Coffee Lake (S/H/E)"),

                // Every Family 6 model not listed above, and every family
                // other than 0..=5 and 15.
                (6..=14, _) | (16..=0xffff_ffff, _) => {
                    really_warn!(
                        "CpuModel::detect: unknown Intel CPU (Family {} Model {}), \
                         assuming Skylake-like; {}",
                        family,
                        model,
                        super::BUG_REPORT_MSG
                    );

                    (UnknownMaybeLakeLike, "")
                }
            };

            if !name.is_empty() {
                info!("CpuModel::detect: known Intel CPU: {} ", name);
            }

            Ok(CpuModel::Intel(gen))
        }

        _ => Err(format!(
            "cpuid returned unknown CPU vendor {:?} ; version={:#x} ",
            vendor, version
        )
        .into()),
    }
}
|---|
| 912 |  | 
|---|
| 913 | /// Return the hardware performance counter configuration for | 
|---|
| 914 | /// counting "hardware interrupts" (documented or not). | 
|---|
| 915 | fn irqs_counter_config(&self) -> Result<u32, Box<dyn Error + Send + Sync>> { | 
|---|
| 916 | match self { | 
|---|
| 917 | CpuModel::Amd(model) => match model { | 
|---|
| 918 | AmdGen::PreZen => Ok(0x00_cf), | 
|---|
| 919 | AmdGen::Zen | AmdGen::UnknownMaybeZenLike => Ok(0x00_2c), | 
|---|
| 920 | }, | 
|---|
| 921 | CpuModel::Intel(model) => match model { | 
|---|
| 922 | IntelGen::PreBridge => Err(format!( | 
|---|
| 923 | "counting IRQs not yet supported on Intel CPUs \ | 
|---|
| 924 |                          predating Sandy Bridge; {} ", | 
|---|
| 925 | super::BUG_REPORT_MSG | 
|---|
| 926 | ) | 
|---|
| 927 | .into()), | 
|---|
| 928 | IntelGen::Bridge | 
|---|
| 929 | | IntelGen::Well | 
|---|
| 930 | | IntelGen::Lake | 
|---|
| 931 | | IntelGen::UnknownMaybeLakeLike => Ok(0x01_cb), | 
|---|
| 932 | }, | 
|---|
| 933 | } | 
|---|
| 934 | } | 
|---|
| 935 | } | 
|---|
| 936 | } | 
|---|
| 937 |  | 
|---|
/// Fallback used on every target other than `x86_64` Linux: all types are
/// uninhabited, and `CpuModel::detect` always fails with an explanation.
#[cfg(not(all(target_arch = "x86_64", target_os = "linux")))]
mod hw {
    use std::error::Error;

    /// Uninhabited: no CPU model can ever be detected on this target.
    pub(super) enum CpuModel {}

    impl CpuModel {
        /// Always returns an error listing which of the architecture/OS
        /// requirements the current target does not meet.
        pub(super) fn detect() -> Result<Self, Box<dyn Error + Send + Sync>> {
            // HACK(eddyb) mark `really_warn!` (and transitively `log` macros)
            // and `BUG_REPORT_MSG` as "used" to silence warnings.
            if false {
                really_warn!("unsupported; {}", super::BUG_REPORT_MSG);
            }

            // Collect every unmet requirement, then report them all at once.
            let mut reasons: Vec<&str> = Vec::new();

            if cfg!(not(target_arch = "x86_64")) {
                reasons.push("only supported architecture is x86_64");
            }

            if cfg!(not(target_os = "linux")) {
                reasons.push("only supported OS is Linux");
            }

            Err(reasons.join("; ").into())
        }
    }

    /// Uninhabited: no hardware counter can ever exist on this target.
    pub(super) enum Counter {}

    impl Counter {
        /// Statically unreachable, as it requires a `CpuModel` value, and
        /// `CpuModel` has no variants here.
        pub(super) fn new(
            model: &CpuModel,
            _: super::HwCounterType,
        ) -> Result<Self, Box<dyn Error + Send + Sync>> {
            match *model {}
        }
    }

    impl super::HwCounterRead for Counter {
        type Output = u64;

        #[inline]
        fn read(&self) -> Self::Output {
            // No `Counter` value can exist, so there is nothing to read.
            match *self {}
        }
    }

    impl super::HwCounterRead for (&Counter, &Counter) {
        type Output = (u64, u64);

        #[inline]
        fn read(&self) -> Self::Output {
            // Either element being uninhabited is enough to rule this out.
            let (first, _second) = *self;
            match *first {}
        }
    }
}
|---|
| 1001 |  | 
|---|