Detect throttle

Signed-off-by: Georgy Evtushenko <evtushenko.georgy@gmail.com>
This commit is contained in:
Georgy Evtushenko
2025-04-07 22:13:09 -07:00
parent 36adf3a210
commit f29f7ac2fb
15 changed files with 435 additions and 33 deletions

View File

@@ -133,6 +133,23 @@
* Applies to the most recent `--benchmark`, or all benchmarks if specified * Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments. before any `--benchmark` arguments.
* `--discard-on-throttle`
* Discard measurements if the GPU is throttled.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--throttle-threshold <value>`
* Set the GPU throttle threshold as percentage of the peak clock rate.
* Default is 0.75 (75%).
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--throttle-recovery-delay <value>`
* Set the GPU throttle recovery delay in seconds.
* Default is 0.5 seconds.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--run-once` * `--run-once`
* Only run the benchmark once, skipping any warmup runs and batched * Only run the benchmark once, skipping any warmup runs and batched
measurements. measurements.

View File

@@ -29,6 +29,8 @@ set(srcs
detail/measure_hot.cu detail/measure_hot.cu
detail/state_generator.cxx detail/state_generator.cxx
detail/stdrel_criterion.cxx detail/stdrel_criterion.cxx
detail/gpu_frequency.cxx
detail/timestamps_kernel.cu
internal/nvml.cxx internal/nvml.cxx
) )

View File

@@ -247,12 +247,39 @@ struct benchmark_base
} }
/// @} /// @}
[[nodiscard]] nvbench::criterion_params& get_criterion_params() { return m_criterion_params; } [[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; }
[[nodiscard]] const nvbench::criterion_params& get_criterion_params() const { return m_criterion_params; }
void set_throttle_threshold(nvbench::float32_t throttle_threshold)
{
m_throttle_threshold = throttle_threshold;
}
[[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const
{
return m_throttle_recovery_delay;
}
void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay)
{
m_throttle_recovery_delay = throttle_recovery_delay;
}
[[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; }
void set_discard_on_throttle(bool discard_on_throttle)
{
m_discard_on_throttle = discard_on_throttle;
}
[[nodiscard]] nvbench::criterion_params &get_criterion_params() { return m_criterion_params; }
[[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
{
return m_criterion_params;
}
/// Control the stopping criterion for the measurement loop. /// Control the stopping criterion for the measurement loop.
/// @{ /// @{
[[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; } [[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; }
benchmark_base &set_stopping_criterion(std::string criterion) benchmark_base &set_stopping_criterion(std::string criterion)
{ {
m_stopping_criterion = std::move(criterion); m_stopping_criterion = std::move(criterion);
@@ -282,6 +309,10 @@ protected:
nvbench::float64_t m_skip_time{-1.}; nvbench::float64_t m_skip_time{-1.};
nvbench::float64_t m_timeout{15.}; nvbench::float64_t m_timeout{15.};
nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate]
nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds]
bool m_discard_on_throttle{false};
nvbench::criterion_params m_criterion_params; nvbench::criterion_params m_criterion_params;
std::string m_stopping_criterion{"stdrel"}; std::string m_stopping_criterion{"stdrel"};

View File

@@ -17,7 +17,6 @@
*/ */
#include <nvbench/benchmark_base.cuh> #include <nvbench/benchmark_base.cuh>
#include <nvbench/detail/transform_reduce.cuh> #include <nvbench/detail/transform_reduce.cuh>
namespace nvbench namespace nvbench
@@ -45,7 +44,11 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
result->m_skip_time = m_skip_time; result->m_skip_time = m_skip_time;
result->m_timeout = m_timeout; result->m_timeout = m_timeout;
result->m_criterion_params = m_criterion_params; result->m_criterion_params = m_criterion_params;
result->m_throttle_threshold = m_throttle_threshold;
result->m_throttle_recovery_delay = m_throttle_recovery_delay;
result->m_discard_on_throttle = m_discard_on_throttle;
result->m_stopping_criterion = m_stopping_criterion; result->m_stopping_criterion = m_stopping_criterion;
return result; return result;

View File

@@ -0,0 +1,53 @@
/*
* Copyright 2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/detail/timestamps_kernel.cuh>
#include <nvbench/types.cuh>
namespace nvbench::detail
{
struct cuda_stream;
struct gpu_frequency
{
gpu_frequency() = default;
// move-only
gpu_frequency(const gpu_frequency &) = delete;
gpu_frequency(gpu_frequency &&) = default;
gpu_frequency &operator=(const gpu_frequency &) = delete;
gpu_frequency &operator=(gpu_frequency &&) = default;
void start(const nvbench::cuda_stream &stream) { m_start.record(stream); }
void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); }
[[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
nvbench::float32_t throttle_threshold);
[[nodiscard]] nvbench::float32_t get_clock_frequency();
private:
nvbench::detail::timestamps_kernel m_start;
nvbench::detail::timestamps_kernel m_stop;
};
} // namespace nvbench::detail

View File

@@ -0,0 +1,47 @@
/*
* Copyright 2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/detail/gpu_frequency.cuh>
#include <iostream>
namespace nvbench::detail
{
nvbench::float32_t gpu_frequency::get_clock_frequency()
{
nvbench::uint64_t elapsed_ns = m_stop.m_host_timestamps[0] - m_start.m_host_timestamps[0];
nvbench::uint64_t elapsed_clocks = m_stop.m_host_timestamps[1] - m_start.m_host_timestamps[1];
nvbench::float32_t clock_rate = float(elapsed_clocks) / float(elapsed_ns) * 1000000000.f;
return clock_rate;
}
bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
nvbench::float32_t throttle_threshold)
{
float threshold = peak_sm_clock_rate_hz * throttle_threshold;
if (this->get_clock_frequency() < threshold)
{
return true;
}
return false;
}
} // namespace nvbench::detail

View File

@@ -27,6 +27,8 @@
#include <algorithm> #include <algorithm>
#include <limits> #include <limits>
#include <chrono>
#include <thread>
#include <fmt/format.h> #include <fmt/format.h>
@@ -44,6 +46,9 @@ measure_cold_base::measure_cold_base(state &exec_state)
, m_min_samples{exec_state.get_min_samples()} , m_min_samples{exec_state.get_min_samples()}
, m_skip_time{exec_state.get_skip_time()} , m_skip_time{exec_state.get_skip_time()}
, m_timeout{exec_state.get_timeout()} , m_timeout{exec_state.get_timeout()}
, m_throttle_threshold(exec_state.get_throttle_threshold())
, m_throttle_recovery_delay(exec_state.get_throttle_recovery_delay())
, m_discard_on_throttle(exec_state.get_discard_on_throttle())
{ {
if (m_min_samples > 0) if (m_min_samples > 0)
{ {
@@ -86,6 +91,41 @@ void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); }
void measure_cold_base::record_measurements() void measure_cold_base::record_measurements()
{ {
if (!m_run_once)
{
auto peak_clock_rate = static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
{
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
{
auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
auto &printer = printer_opt_ref.value().get();
printer.log(nvbench::log_level::warn,
fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
"({:0.0f}% < {:0.0f}%) on sample {}. {} previous sample and "
"pausing for {}s.",
current_clock_rate / 1000000.0f,
peak_clock_rate / 1000000.0f,
100.0f * (current_clock_rate / peak_clock_rate),
100.0f * m_throttle_threshold,
m_total_samples,
m_discard_on_throttle ? "Discarding" : "Keeping",
m_throttle_recovery_delay));
}
if (m_throttle_recovery_delay > 0.0f)
{ // let the GPU cool down
std::this_thread::sleep_for(std::chrono::duration<float>(m_throttle_recovery_delay));
}
if (m_discard_on_throttle)
{ // ignore this measurement
return;
}
}
}
// Update and record timers and counters: // Update and record timers and counters:
const auto cur_cuda_time = m_cuda_timer.get_duration(); const auto cur_cuda_time = m_cuda_timer.get_duration();
const auto cur_cpu_time = m_cpu_timer.get_duration(); const auto cur_cpu_time = m_cpu_timer.get_duration();

View File

@@ -30,6 +30,7 @@
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh> #include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#include <nvbench/detail/l2flush.cuh> #include <nvbench/detail/l2flush.cuh>
#include <nvbench/detail/statistics.cuh> #include <nvbench/detail/statistics.cuh>
#include <nvbench/detail/gpu_frequency.cuh>
#include <cuda_runtime.h> #include <cuda_runtime.h>
@@ -64,6 +65,8 @@ protected:
bool is_finished(); bool is_finished();
void run_trials_epilogue(); void run_trials_epilogue();
void generate_summaries(); void generate_summaries();
void gpu_frequency_start() { m_gpu_frequency.start(m_launch.get_stream()); }
void gpu_frequency_stop() { m_gpu_frequency.stop(m_launch.get_stream()); }
void check_skip_time(nvbench::float64_t warmup_time); void check_skip_time(nvbench::float64_t warmup_time);
@@ -88,6 +91,7 @@ protected:
nvbench::criterion_params m_criterion_params; nvbench::criterion_params m_criterion_params;
nvbench::stopping_criterion_base& m_stopping_criterion; nvbench::stopping_criterion_base& m_stopping_criterion;
nvbench::detail::gpu_frequency m_gpu_frequency;
bool m_disable_blocking_kernel{false}; bool m_disable_blocking_kernel{false};
bool m_run_once{false}; bool m_run_once{false};
@@ -97,6 +101,10 @@ protected:
nvbench::float64_t m_skip_time{}; nvbench::float64_t m_skip_time{};
nvbench::float64_t m_timeout{}; nvbench::float64_t m_timeout{};
nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate]
nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds]
bool m_discard_on_throttle{false};
nvbench::int64_t m_total_samples{}; nvbench::int64_t m_total_samples{};
nvbench::float64_t m_min_cuda_time{}; nvbench::float64_t m_min_cuda_time{};
@@ -128,6 +136,10 @@ struct measure_cold_base::kernel_launch_timer
{ {
m_measure.block_stream(); m_measure.block_stream();
} }
if (!m_measure.m_run_once)
{
m_measure.gpu_frequency_start();
}
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream()); m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
if (m_disable_blocking_kernel) if (m_disable_blocking_kernel)
{ {
@@ -143,6 +155,10 @@ struct measure_cold_base::kernel_launch_timer
m_measure.m_cpu_timer.start(); m_measure.m_cpu_timer.start();
m_measure.unblock_stream(); m_measure.unblock_stream();
} }
if (!m_measure.m_run_once)
{
m_measure.gpu_frequency_stop();
}
m_measure.sync_stream(); m_measure.sync_stream();
m_measure.m_cpu_timer.stop(); m_measure.m_cpu_timer.stop();
} }

View File

@@ -0,0 +1,82 @@
/*
* Copyright 2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime.h>
#include <nvbench/cuda_call.cuh>
#include <nvbench/cuda_stream.cuh>
#include <nvbench/detail/timestamps_kernel.cuh>
#include <nvbench/types.cuh>
#include <cstdio>
#include <cstdlib>
namespace
{
__global__ void get_timestamps_kernel(nvbench::uint64_t *global_timestamp,
nvbench::uint64_t *sm0_timestamp)
{
nvbench::uint32_t smid;
asm volatile("mov.u32 %0, %%smid;" : "=r"(smid));
if (smid == 0)
{
nvbench::uint64_t gts, lts;
asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(gts));
lts = clock64();
*global_timestamp = gts;
*sm0_timestamp = lts;
}
}
} // namespace
namespace nvbench::detail
{
timestamps_kernel::timestamps_kernel()
{
NVBENCH_CUDA_CALL(
cudaHostRegister(&m_host_timestamps, sizeof(nvbench::uint64_t) * 2, cudaHostRegisterMapped));
NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_timestamps, &m_host_timestamps, 0));
}
timestamps_kernel::~timestamps_kernel()
{
NVBENCH_CUDA_CALL_NOEXCEPT(cudaHostUnregister(&m_host_timestamps));
}
void timestamps_kernel::record(const nvbench::cuda_stream &stream)
{
m_host_timestamps[0] = 0;
m_host_timestamps[1] = 0;
int device_id = 0;
int num_sms = 0;
NVBENCH_CUDA_CALL(cudaGetDevice(&device_id));
NVBENCH_CUDA_CALL(
cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device_id));
get_timestamps_kernel<<<static_cast<unsigned int>(num_sms), 1, 0, stream.get_stream()>>>(
m_device_timestamps,
m_device_timestamps + 1);
}
} // namespace nvbench

View File

@@ -0,0 +1,50 @@
/*
* Copyright 2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/types.cuh>
namespace nvbench
{
struct cuda_stream;
namespace detail
{
struct timestamps_kernel
{
timestamps_kernel();
~timestamps_kernel();
void record(const nvbench::cuda_stream &stream);
// move-only
timestamps_kernel(const timestamps_kernel &) = delete;
timestamps_kernel(timestamps_kernel &&) = default;
timestamps_kernel &operator=(const timestamps_kernel &) = delete;
timestamps_kernel &operator=(timestamps_kernel &&) = default;
nvbench::uint64_t m_host_timestamps[2];
nvbench::uint64_t *m_device_timestamps{};
};
} // namespace detail
} // namespace nvbench

View File

@@ -18,17 +18,18 @@
#pragma once #pragma once
#include <cuda_runtime_api.h>
#include <nvbench/config.cuh> #include <nvbench/config.cuh>
#include <nvbench/cuda_call.cuh> #include <nvbench/cuda_call.cuh>
#include <nvbench/detail/device_scope.cuh> #include <nvbench/detail/device_scope.cuh>
#include <cuda_runtime_api.h>
#include <cstdint> // CHAR_BIT #include <cstdint> // CHAR_BIT
#include <stdexcept> #include <stdexcept>
#include <string_view>
#include <utility> #include <utility>
#include <string_view>
// forward declare this for internal storage // forward declare this for internal storage
struct nvmlDevice_st; struct nvmlDevice_st;

View File

@@ -16,28 +16,24 @@
* limitations under the License. * limitations under the License.
*/ */
#include <nvbench/option_parser.cuh>
#include <nvbench/benchmark_base.cuh> #include <nvbench/benchmark_base.cuh>
#include <nvbench/benchmark_manager.cuh> #include <nvbench/benchmark_manager.cuh>
#include <nvbench/csv_printer.cuh>
#include <nvbench/criterion_manager.cuh> #include <nvbench/criterion_manager.cuh>
#include <nvbench/csv_printer.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/device_manager.cuh> #include <nvbench/device_manager.cuh>
#include <nvbench/git_revision.cuh> #include <nvbench/git_revision.cuh>
#include <nvbench/json_printer.cuh> #include <nvbench/json_printer.cuh>
#include <nvbench/markdown_printer.cuh> #include <nvbench/markdown_printer.cuh>
#include <nvbench/option_parser.cuh>
#include <nvbench/printer_base.cuh> #include <nvbench/printer_base.cuh>
#include <nvbench/range.cuh> #include <nvbench/range.cuh>
#include <nvbench/version.cuh> #include <nvbench/version.cuh>
#include <nvbench/detail/throw.cuh>
// These are generated from the markdown docs by CMake in the build directory: // These are generated from the markdown docs by CMake in the build directory:
#include <nvbench/internal/cli_help.cuh> #include <nvbench/internal/cli_help.cuh>
#include <nvbench/internal/cli_help_axis.cuh> #include <nvbench/internal/cli_help_axis.cuh>
#include <fmt/format.h>
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <cstdlib> #include <cstdlib>
@@ -48,10 +44,12 @@
#include <regex> #include <regex>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <string_view>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#include <fmt/format.h>
#include <string_view>
namespace namespace
{ {
@@ -433,6 +431,11 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
this->lock_gpu_clocks(first[1]); this->lock_gpu_clocks(first[1]);
first += 2; first += 2;
} }
else if (arg == "--discard-on-throttle")
{
this->enable_discard_on_throttle();
first += 1;
}
else if (arg == "--run-once") else if (arg == "--run-once")
{ {
this->enable_run_once(); this->enable_run_once();
@@ -515,7 +518,8 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
this->update_int64_prop(first[0], first[1]); this->update_int64_prop(first[0], first[1]);
first += 2; first += 2;
} }
else if (arg == "--skip-time" || arg == "--timeout") else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" ||
arg == "--throttle-recovery-delay")
{ {
check_params(1); check_params(1);
this->update_float64_prop(first[0], first[1]); this->update_float64_prop(first[0], first[1]);
@@ -623,7 +627,7 @@ void option_parser::print_version() const
NVBENCH_GIT_VERSION); NVBENCH_GIT_VERSION);
} }
void option_parser::print_list(printer_base& printer) const void option_parser::print_list(printer_base &printer) const
{ {
const auto &bench_mgr = nvbench::benchmark_manager::get(); const auto &bench_mgr = nvbench::benchmark_manager::get();
printer.print_device_info(); printer.print_device_info();
@@ -727,6 +731,18 @@ void option_parser::enable_run_once()
bench.set_run_once(true); bench.set_run_once(true);
} }
void option_parser::enable_discard_on_throttle()
{
if (m_benchmarks.empty())
{
m_global_benchmark_args.push_back("--discard-on-throttle");
return;
}
benchmark_base &bench = *m_benchmarks.back();
bench.set_discard_on_throttle(true);
}
void option_parser::set_stopping_criterion(const std::string &criterion) void option_parser::set_stopping_criterion(const std::string &criterion)
{ {
// If no active benchmark, save args as global. // If no active benchmark, save args as global.
@@ -979,10 +995,9 @@ catch (std::exception &e)
e.what()); e.what());
} }
void option_parser::update_criterion_prop( void option_parser::update_criterion_prop(const std::string &prop_arg,
const std::string &prop_arg, const std::string &prop_val,
const std::string &prop_val, const nvbench::named_values::type type)
const nvbench::named_values::type type)
try try
{ {
// If no active benchmark, save args as global. // If no active benchmark, save args as global.
@@ -993,8 +1008,8 @@ try
return; return;
} }
benchmark_base &bench = *m_benchmarks.back(); benchmark_base &bench = *m_benchmarks.back();
nvbench::criterion_params& criterion_params = bench.get_criterion_params(); nvbench::criterion_params &criterion_params = bench.get_criterion_params();
std::string name(prop_arg.begin() + 2, prop_arg.end()); std::string name(prop_arg.begin() + 2, prop_arg.end());
if (type == nvbench::named_values::type::float64) if (type == nvbench::named_values::type::float64)
{ {
@@ -1022,7 +1037,7 @@ try
NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg); NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg);
} }
} }
catch (std::exception& e) catch (std::exception &e)
{ {
NVBENCH_THROW(std::runtime_error, NVBENCH_THROW(std::runtime_error,
"Error handling option `{} {}`:\n{}", "Error handling option `{} {}`:\n{}",
@@ -1054,6 +1069,14 @@ try
{ {
bench.set_timeout(value); bench.set_timeout(value);
} }
else if (prop_arg == "--throttle-threshold")
{
bench.set_throttle_threshold(static_cast<nvbench::float32_t>(value));
}
else if (prop_arg == "--throttle-recovery-delay")
{
bench.set_throttle_recovery_delay(static_cast<nvbench::float32_t>(value));
}
else else
{ {
NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg); NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg);

View File

@@ -80,7 +80,7 @@ private:
std::ostream &printer_spec_to_ostream(const std::string &spec); std::ostream &printer_spec_to_ostream(const std::string &spec);
void print_version() const; void print_version() const;
void print_list(printer_base& printer) const; void print_list(printer_base &printer) const;
void print_help() const; void print_help() const;
void print_help_axis() const; void print_help_axis() const;
@@ -89,6 +89,7 @@ private:
void set_stopping_criterion(const std::string &criterion); void set_stopping_criterion(const std::string &criterion);
void enable_run_once(); void enable_run_once();
void enable_discard_on_throttle();
void disable_blocking_kernel(); void disable_blocking_kernel();
void add_benchmark(const std::string &name); void add_benchmark(const std::string &name);

View File

@@ -22,9 +22,9 @@
#include <nvbench/device_info.cuh> #include <nvbench/device_info.cuh>
#include <nvbench/exec_tag.cuh> #include <nvbench/exec_tag.cuh>
#include <nvbench/named_values.cuh> #include <nvbench/named_values.cuh>
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/summary.cuh> #include <nvbench/summary.cuh>
#include <nvbench/types.cuh> #include <nvbench/types.cuh>
#include <nvbench/stopping_criterion.cuh>
#include <functional> #include <functional>
#include <optional> #include <optional>
@@ -136,8 +136,11 @@ struct state
/// Control the stopping criterion for the measurement loop. /// Control the stopping criterion for the measurement loop.
/// @{ /// @{
[[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; } [[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; }
void set_stopping_criterion(std::string criterion) { m_stopping_criterion = std::move(criterion); } void set_stopping_criterion(std::string criterion)
{
m_stopping_criterion = std::move(criterion);
}
/// @} /// @}
/// If true, the benchmark is only run once, skipping all warmup runs and only /// If true, the benchmark is only run once, skipping all warmup runs and only
@@ -200,6 +203,30 @@ struct state
void set_timeout(nvbench::float64_t timeout) { m_timeout = timeout; } void set_timeout(nvbench::float64_t timeout) { m_timeout = timeout; }
/// @} /// @}
[[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; }
void set_throttle_threshold(nvbench::float32_t throttle_threshold)
{
m_throttle_threshold = throttle_threshold;
}
[[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const
{
return m_throttle_recovery_delay;
}
void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay)
{
m_throttle_recovery_delay = throttle_recovery_delay;
}
[[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; }
void set_discard_on_throttle(bool discard_on_throttle)
{
m_discard_on_throttle = discard_on_throttle;
}
/// If a `KernelLauncher` syncs and `nvbench::exec_tag::sync` is not passed /// If a `KernelLauncher` syncs and `nvbench::exec_tag::sync` is not passed
/// to `state.exec(...)`, a deadlock may occur. If a `blocking_kernel` blocks /// to `state.exec(...)`, a deadlock may occur. If a `blocking_kernel` blocks
/// for more than `blocking_kernel_timeout` seconds, an error will be printed /// for more than `blocking_kernel_timeout` seconds, an error will be printed
@@ -311,6 +338,10 @@ private:
nvbench::float64_t m_skip_time; nvbench::float64_t m_skip_time;
nvbench::float64_t m_timeout; nvbench::float64_t m_timeout;
nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate]
nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds]
bool m_discard_on_throttle{false};
// Deadlock protection. See blocking_kernel's class doc for details. // Deadlock protection. See blocking_kernel's class doc for details.
nvbench::float64_t m_blocking_kernel_timeout{30.0}; nvbench::float64_t m_blocking_kernel_timeout{30.0};

View File

@@ -15,20 +15,19 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#include <nvbench/state.cuh> #include <nvbench/state.cuh>
#include <nvbench/benchmark_base.cuh> #include <nvbench/benchmark_base.cuh>
#include <nvbench/detail/throw.cuh> #include <nvbench/detail/throw.cuh>
#include <nvbench/types.cuh> #include <nvbench/types.cuh>
#include <fmt/color.h>
#include <fmt/format.h>
#include <algorithm> #include <algorithm>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <fmt/color.h>
#include <fmt/format.h>
namespace nvbench namespace nvbench
{ {
@@ -42,6 +41,9 @@ state::state(const benchmark_base &bench)
, m_min_samples{bench.get_min_samples()} , m_min_samples{bench.get_min_samples()}
, m_skip_time{bench.get_skip_time()} , m_skip_time{bench.get_skip_time()}
, m_timeout{bench.get_timeout()} , m_timeout{bench.get_timeout()}
, m_throttle_threshold{bench.get_throttle_threshold()}
, m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
, m_discard_on_throttle{bench.get_discard_on_throttle()}
{} {}
state::state(const benchmark_base &bench, state::state(const benchmark_base &bench,
@@ -60,6 +62,9 @@ state::state(const benchmark_base &bench,
, m_min_samples{bench.get_min_samples()} , m_min_samples{bench.get_min_samples()}
, m_skip_time{bench.get_skip_time()} , m_skip_time{bench.get_skip_time()}
, m_timeout{bench.get_timeout()} , m_timeout{bench.get_timeout()}
, m_throttle_threshold{bench.get_throttle_threshold()}
, m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
, m_discard_on_throttle{bench.get_discard_on_throttle()}
{} {}
nvbench::int64_t state::get_int64(const std::string &axis_name) const nvbench::int64_t state::get_int64(const std::string &axis_name) const