mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-03-14 20:27:24 +00:00
Merge pull request #207 from alliepiper/throttle_followup
Throttling followup
This commit is contained in:
@@ -134,7 +134,7 @@
|
||||
before any `--benchmark` arguments.
|
||||
|
||||
* `--throttle-threshold <value>`
|
||||
* Set the GPU throttle threshold as percentage of the peak clock rate.
|
||||
* Set the GPU throttle threshold as percentage of the device's default clock rate.
|
||||
* Default is 75%.
|
||||
* Applies to the most recent `--benchmark`, or all benchmarks if specified
|
||||
before any `--benchmark` arguments.
|
||||
|
||||
@@ -41,7 +41,7 @@ void summary_example(nvbench::state &state)
|
||||
}
|
||||
|
||||
// Run the measurements:
|
||||
state.exec([duration](nvbench::launch &launch) {
|
||||
state.exec(nvbench::exec_tag::no_batch, [duration](nvbench::launch &launch) {
|
||||
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration);
|
||||
});
|
||||
|
||||
@@ -56,14 +56,17 @@ void summary_example(nvbench::state &state)
|
||||
#endif
|
||||
|
||||
// Default summary columns can be shown/hidden in the markdown output tables by adding/removing
|
||||
// the "hide" key. Modify this benchmark to show the minimum and maximum times, but hide the
|
||||
// means.
|
||||
// the "hide" key. Modify this benchmark to show the minimum and maximum GPUs times, but hide the
|
||||
// mean GPU time and all CPU times. SM Clock frequency and throttling info are also shown.
|
||||
state.get_summary("nv/cold/time/gpu/min").remove_value("hide");
|
||||
state.get_summary("nv/cold/time/gpu/max").remove_value("hide");
|
||||
state.get_summary("nv/cold/time/gpu/mean").set_string("hide", "");
|
||||
state.get_summary("nv/cold/time/cpu/min").remove_value("hide");
|
||||
state.get_summary("nv/cold/time/cpu/max").remove_value("hide");
|
||||
state.get_summary("nv/cold/time/cpu/mean").set_string("hide", "");
|
||||
state.get_summary("nv/cold/time/cpu/min").set_string("hide", "");
|
||||
state.get_summary("nv/cold/time/cpu/max").set_string("hide", "");
|
||||
state.get_summary("nv/cold/time/cpu/stdev/relative").set_string("hide", "");
|
||||
state.get_summary("nv/cold/sm_clock_rate/mean").remove_value("hide");
|
||||
state.get_summary("nv/cold/sm_clock_rate/scaling/percent").remove_value("hide");
|
||||
}
|
||||
NVBENCH_BENCH(summary_example)
|
||||
.add_int64_axis("ms", nvbench::range(10, 50, 20))
|
||||
|
||||
@@ -302,7 +302,7 @@ protected:
|
||||
nvbench::float64_t m_skip_time{-1.};
|
||||
nvbench::float64_t m_timeout{15.};
|
||||
|
||||
nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate]
|
||||
nvbench::float32_t m_throttle_threshold{0.75f}; // [% of default SM clock rate]
|
||||
nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds]
|
||||
|
||||
nvbench::criterion_params m_criterion_params;
|
||||
|
||||
@@ -40,7 +40,7 @@ struct gpu_frequency
|
||||
|
||||
void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); }
|
||||
|
||||
[[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
|
||||
[[nodiscard]] bool has_throttled(nvbench::float32_t default_sm_clock_rate_hz,
|
||||
nvbench::float32_t throttle_threshold);
|
||||
|
||||
[[nodiscard]] nvbench::float32_t get_clock_frequency();
|
||||
|
||||
@@ -31,10 +31,10 @@ nvbench::float32_t gpu_frequency::get_clock_frequency()
|
||||
return clock_rate;
|
||||
}
|
||||
|
||||
bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
|
||||
bool gpu_frequency::has_throttled(nvbench::float32_t default_sm_clock_rate_hz,
|
||||
nvbench::float32_t throttle_threshold)
|
||||
{
|
||||
float threshold = peak_sm_clock_rate_hz * throttle_threshold;
|
||||
float threshold = default_sm_clock_rate_hz * throttle_threshold;
|
||||
|
||||
if (this->get_clock_frequency() < threshold)
|
||||
{
|
||||
|
||||
@@ -53,7 +53,6 @@ measure_cold_base::measure_cold_base(state &exec_state)
|
||||
{
|
||||
m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
|
||||
m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
|
||||
m_sm_clock_rates.reserve(static_cast<std::size_t>(m_min_samples));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,18 +71,18 @@ void measure_cold_base::check()
|
||||
|
||||
void measure_cold_base::initialize()
|
||||
{
|
||||
m_min_cuda_time = std::numeric_limits<nvbench::float64_t>::max();
|
||||
m_max_cuda_time = std::numeric_limits<nvbench::float64_t>::lowest();
|
||||
m_total_cuda_time = 0.;
|
||||
m_min_cpu_time = std::numeric_limits<nvbench::float64_t>::max();
|
||||
m_max_cpu_time = std::numeric_limits<nvbench::float64_t>::lowest();
|
||||
m_total_cpu_time = 0.;
|
||||
m_total_samples = 0;
|
||||
m_max_time_exceeded = false;
|
||||
m_min_cuda_time = std::numeric_limits<nvbench::float64_t>::max();
|
||||
m_max_cuda_time = std::numeric_limits<nvbench::float64_t>::lowest();
|
||||
m_total_cuda_time = 0.;
|
||||
m_min_cpu_time = std::numeric_limits<nvbench::float64_t>::max();
|
||||
m_max_cpu_time = std::numeric_limits<nvbench::float64_t>::lowest();
|
||||
m_total_cpu_time = 0.;
|
||||
m_sm_clock_rate_accumulator = 0.;
|
||||
m_total_samples = 0;
|
||||
m_max_time_exceeded = false;
|
||||
|
||||
m_cuda_times.clear();
|
||||
m_cpu_times.clear();
|
||||
m_sm_clock_rates.clear();
|
||||
|
||||
m_stopping_criterion.initialize(m_criterion_params);
|
||||
}
|
||||
@@ -94,21 +93,22 @@ void measure_cold_base::record_measurements()
|
||||
{
|
||||
if (!m_run_once)
|
||||
{
|
||||
auto peak_clock_rate = static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
|
||||
const auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
|
||||
const auto default_clock_rate =
|
||||
static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
|
||||
|
||||
if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
|
||||
if (m_gpu_frequency.has_throttled(default_clock_rate, m_throttle_threshold))
|
||||
{
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
printer.log(nvbench::log_level::warn,
|
||||
fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
|
||||
"({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample "
|
||||
"and pausing for {}s.",
|
||||
current_clock_rate / 1000000.0f,
|
||||
peak_clock_rate / 1000000.0f,
|
||||
100.0f * (current_clock_rate / peak_clock_rate),
|
||||
default_clock_rate / 1000000.0f,
|
||||
100.0f * (current_clock_rate / default_clock_rate),
|
||||
100.0f * m_throttle_threshold,
|
||||
m_total_samples,
|
||||
m_throttle_recovery_delay));
|
||||
@@ -123,7 +123,7 @@ void measure_cold_base::record_measurements()
|
||||
return;
|
||||
}
|
||||
|
||||
m_sm_clock_rates.push_back(peak_clock_rate);
|
||||
m_sm_clock_rate_accumulator += current_clock_rate;
|
||||
}
|
||||
|
||||
// Update and record timers and counters:
|
||||
@@ -338,16 +338,30 @@ void measure_cold_base::generate_summaries()
|
||||
summ.set_string("hide", "Hidden by default.");
|
||||
}
|
||||
|
||||
if (!m_sm_clock_rates.empty())
|
||||
if (m_sm_clock_rate_accumulator != 0.)
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
|
||||
summ.set_string("name", "Clock Rate");
|
||||
summ.set_string("hint", "frequency");
|
||||
summ.set_string("description", "Mean SM clock rate");
|
||||
summ.set_string("hide", "Hidden by default.");
|
||||
summ.set_float64("value",
|
||||
nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(),
|
||||
m_sm_clock_rates.cend()));
|
||||
const auto clock_mean = m_sm_clock_rate_accumulator / d_samples;
|
||||
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
|
||||
summ.set_string("name", "Clock Rate");
|
||||
summ.set_string("hint", "frequency");
|
||||
summ.set_string("description", "Mean SM clock rate");
|
||||
summ.set_string("hide", "Hidden by default.");
|
||||
summ.set_float64("value", clock_mean);
|
||||
}
|
||||
|
||||
{
|
||||
const auto default_clock_rate =
|
||||
static_cast<nvbench::float64_t>(m_state.get_device()->get_sm_default_clock_rate());
|
||||
|
||||
auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/scaling/percent");
|
||||
summ.set_string("name", "Clock Scaling");
|
||||
summ.set_string("hint", "percentage");
|
||||
summ.set_string("description", "Mean SM clock rate as a percentage of default clock rate.");
|
||||
summ.set_string("hide", "Hidden by default.");
|
||||
summ.set_float64("value", clock_mean / default_clock_rate);
|
||||
}
|
||||
}
|
||||
|
||||
// Log if a printer exists:
|
||||
|
||||
@@ -101,7 +101,7 @@ protected:
|
||||
nvbench::float64_t m_skip_time{};
|
||||
nvbench::float64_t m_timeout{};
|
||||
|
||||
nvbench::float32_t m_throttle_threshold; // [% of peak SM clock rate]
|
||||
nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate]
|
||||
nvbench::float32_t m_throttle_recovery_delay; // [seconds]
|
||||
|
||||
nvbench::int64_t m_total_samples{};
|
||||
@@ -114,9 +114,10 @@ protected:
|
||||
nvbench::float64_t m_max_cpu_time{};
|
||||
nvbench::float64_t m_total_cpu_time{};
|
||||
|
||||
nvbench::float64_t m_sm_clock_rate_accumulator{};
|
||||
|
||||
std::vector<nvbench::float64_t> m_cuda_times;
|
||||
std::vector<nvbench::float64_t> m_cpu_times;
|
||||
std::vector<nvbench::float32_t> m_sm_clock_rates;
|
||||
|
||||
bool m_max_time_exceeded{};
|
||||
};
|
||||
|
||||
@@ -331,7 +331,7 @@ private:
|
||||
nvbench::float64_t m_skip_time;
|
||||
nvbench::float64_t m_timeout;
|
||||
|
||||
nvbench::float32_t m_throttle_threshold; // [% of peak SM clock rate]
|
||||
nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate]
|
||||
nvbench::float32_t m_throttle_recovery_delay; // [seconds]
|
||||
|
||||
// Deadlock protection. See blocking_kernel's class doc for details.
|
||||
|
||||
Reference in New Issue
Block a user