diff --git a/docs/cli_help.md b/docs/cli_help.md index d6af964..336a826 100644 --- a/docs/cli_help.md +++ b/docs/cli_help.md @@ -134,7 +134,7 @@ before any `--benchmark` arguments. * `--throttle-threshold ` - * Set the GPU throttle threshold as percentage of the peak clock rate. + * Set the GPU throttle threshold as percentage of the device's default clock rate. * Default is 75%. * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. diff --git a/examples/summaries.cu b/examples/summaries.cu index ad4fc06..0bb91ae 100644 --- a/examples/summaries.cu +++ b/examples/summaries.cu @@ -41,7 +41,7 @@ void summary_example(nvbench::state &state) } // Run the measurements: - state.exec([duration](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [duration](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration); }); @@ -56,14 +56,17 @@ void summary_example(nvbench::state &state) #endif // Default summary columns can be shown/hidden in the markdown output tables by adding/removing - // the "hide" key. Modify this benchmark to show the minimum and maximum times, but hide the - // means. + // the "hide" key. Modify this benchmark to show the minimum and maximum GPUs times, but hide the + // mean GPU time and all CPU times. SM Clock frequency and throttling info are also shown. state.get_summary("nv/cold/time/gpu/min").remove_value("hide"); state.get_summary("nv/cold/time/gpu/max").remove_value("hide"); state.get_summary("nv/cold/time/gpu/mean").set_string("hide", ""); - state.get_summary("nv/cold/time/cpu/min").remove_value("hide"); - state.get_summary("nv/cold/time/cpu/max").remove_value("hide"); state.get_summary("nv/cold/time/cpu/mean").set_string("hide", ""); + state.get_summary("nv/cold/time/cpu/min").set_string("hide", ""); + state.get_summary("nv/cold/time/cpu/max").set_string("hide", ""); + state.get_summary("nv/cold/time/cpu/stdev/relative").set_string("hide", ""); + state.get_summary("nv/cold/sm_clock_rate/mean").remove_value("hide"); + state.get_summary("nv/cold/sm_clock_rate/scaling/percent").remove_value("hide"); } NVBENCH_BENCH(summary_example) .add_int64_axis("ms", nvbench::range(10, 50, 20)) diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index d35779c..58c55b6 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -302,7 +302,7 @@ protected: nvbench::float64_t m_skip_time{-1.}; nvbench::float64_t m_timeout{15.}; - nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_threshold{0.75f}; // [% of default SM clock rate] nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds] nvbench::criterion_params m_criterion_params; diff --git a/nvbench/detail/gpu_frequency.cuh b/nvbench/detail/gpu_frequency.cuh index 320d3d2..dc0dcd7 100644 --- a/nvbench/detail/gpu_frequency.cuh +++ b/nvbench/detail/gpu_frequency.cuh @@ -40,7 +40,7 @@ struct gpu_frequency void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); } - [[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz, + [[nodiscard]] bool has_throttled(nvbench::float32_t default_sm_clock_rate_hz, nvbench::float32_t throttle_threshold); [[nodiscard]] nvbench::float32_t get_clock_frequency(); diff --git a/nvbench/detail/gpu_frequency.cxx b/nvbench/detail/gpu_frequency.cxx index 8f2d19b..adc1011 100644 --- a/nvbench/detail/gpu_frequency.cxx +++ b/nvbench/detail/gpu_frequency.cxx @@ -31,10 +31,10 @@ nvbench::float32_t gpu_frequency::get_clock_frequency() return clock_rate; } -bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz, +bool gpu_frequency::has_throttled(nvbench::float32_t default_sm_clock_rate_hz, nvbench::float32_t throttle_threshold) { - float threshold = peak_sm_clock_rate_hz * throttle_threshold; + float threshold = default_sm_clock_rate_hz * throttle_threshold; if (this->get_clock_frequency() < threshold) { diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index 2bbf0e0..a602062 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -53,7 +53,6 @@ measure_cold_base::measure_cold_base(state &exec_state) { m_cuda_times.reserve(static_cast(m_min_samples)); m_cpu_times.reserve(static_cast(m_min_samples)); - m_sm_clock_rates.reserve(static_cast(m_min_samples)); } } @@ -72,18 +71,18 @@ void measure_cold_base::check() void measure_cold_base::initialize() { - m_min_cuda_time = std::numeric_limits::max(); - m_max_cuda_time = std::numeric_limits::lowest(); - m_total_cuda_time = 0.; - m_min_cpu_time = std::numeric_limits::max(); - m_max_cpu_time = std::numeric_limits::lowest(); - m_total_cpu_time = 0.; - m_total_samples = 0; - m_max_time_exceeded = false; + m_min_cuda_time = std::numeric_limits::max(); + m_max_cuda_time = std::numeric_limits::lowest(); + m_total_cuda_time = 0.; + m_min_cpu_time = std::numeric_limits::max(); + m_max_cpu_time = std::numeric_limits::lowest(); + m_total_cpu_time = 0.; + m_sm_clock_rate_accumulator = 0.; + m_total_samples = 0; + m_max_time_exceeded = false; m_cuda_times.clear(); m_cpu_times.clear(); - m_sm_clock_rates.clear(); m_stopping_criterion.initialize(m_criterion_params); } @@ -94,21 +93,22 @@ void measure_cold_base::record_measurements() { if (!m_run_once) { - auto peak_clock_rate = static_cast(m_state.get_device()->get_sm_default_clock_rate()); + const auto current_clock_rate = m_gpu_frequency.get_clock_frequency(); + const auto default_clock_rate = + static_cast(m_state.get_device()->get_sm_default_clock_rate()); - if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold)) + if (m_gpu_frequency.has_throttled(default_clock_rate, m_throttle_threshold)) { if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value()) { - auto current_clock_rate = m_gpu_frequency.get_clock_frequency(); - auto &printer = printer_opt_ref.value().get(); + auto &printer = printer_opt_ref.value().get(); printer.log(nvbench::log_level::warn, fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) " "({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample " "and pausing for {}s.", current_clock_rate / 1000000.0f, - peak_clock_rate / 1000000.0f, - 100.0f * (current_clock_rate / peak_clock_rate), + default_clock_rate / 1000000.0f, + 100.0f * (current_clock_rate / default_clock_rate), 100.0f * m_throttle_threshold, m_total_samples, m_throttle_recovery_delay)); @@ -123,7 +123,7 @@ void measure_cold_base::record_measurements() return; } - m_sm_clock_rates.push_back(peak_clock_rate); + m_sm_clock_rate_accumulator += current_clock_rate; } // Update and record timers and counters: @@ -338,16 +338,30 @@ void measure_cold_base::generate_summaries() summ.set_string("hide", "Hidden by default."); } - if (!m_sm_clock_rates.empty()) + if (m_sm_clock_rate_accumulator != 0.) { - auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean"); - summ.set_string("name", "Clock Rate"); - summ.set_string("hint", "frequency"); - summ.set_string("description", "Mean SM clock rate"); - summ.set_string("hide", "Hidden by default."); - summ.set_float64("value", - nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(), - m_sm_clock_rates.cend())); + const auto clock_mean = m_sm_clock_rate_accumulator / d_samples; + + { + auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean"); + summ.set_string("name", "Clock Rate"); + summ.set_string("hint", "frequency"); + summ.set_string("description", "Mean SM clock rate"); + summ.set_string("hide", "Hidden by default."); + summ.set_float64("value", clock_mean); + } + + { + const auto default_clock_rate = + static_cast(m_state.get_device()->get_sm_default_clock_rate()); + + auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/scaling/percent"); + summ.set_string("name", "Clock Scaling"); + summ.set_string("hint", "percentage"); + summ.set_string("description", "Mean SM clock rate as a percentage of default clock rate."); + summ.set_string("hide", "Hidden by default."); + summ.set_float64("value", clock_mean / default_clock_rate); + } } // Log if a printer exists: diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index 2138b3d..26eee62 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -101,7 +101,7 @@ protected: nvbench::float64_t m_skip_time{}; nvbench::float64_t m_timeout{}; - nvbench::float32_t m_throttle_threshold; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate] nvbench::float32_t m_throttle_recovery_delay; // [seconds] nvbench::int64_t m_total_samples{}; @@ -114,9 +114,10 @@ protected: nvbench::float64_t m_max_cpu_time{}; nvbench::float64_t m_total_cpu_time{}; + nvbench::float64_t m_sm_clock_rate_accumulator{}; + std::vector m_cuda_times; std::vector m_cpu_times; - std::vector m_sm_clock_rates; bool m_max_time_exceeded{}; }; diff --git a/nvbench/state.cuh b/nvbench/state.cuh index 9b0a5c1..0691dc6 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -331,7 +331,7 @@ private: nvbench::float64_t m_skip_time; nvbench::float64_t m_timeout; - nvbench::float32_t m_throttle_threshold; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate] nvbench::float32_t m_throttle_recovery_delay; // [seconds] // Deadlock protection. See blocking_kernel's class doc for details.