mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-05-11 17:00:01 +00:00
Rename --warmup-runs to --cold-warmup-runs, add --cold-max-warmup-walltime
Since the configurable number of warmups only applies to measure_cold.cuh, rename the CLI option to reflect that. Also add --cold-max-warmup-walltime (defaults to -1, i.e. disabled). If enabled, the warmup loop exits before the requested count is reached once the wall-time expended executing warmups exceeds this max-warmup-walltime value.
This commit is contained in:
@@ -93,13 +93,21 @@
|
||||
* Applies to the most recent `--benchmark`, or all benchmarks if specified
|
||||
before any `--benchmark` arguments.
|
||||
|
||||
* `--warmup-runs <count>`
|
||||
* Execute `<count>` warmup runs before collecting cold measurement samples.
|
||||
* `--cold-warmup-runs <count>`
|
||||
* Execute up to `<count>` warmup runs before collecting cold measurement samples.
|
||||
* The minimum is 1 warmup run.
|
||||
* Default is 1 warmup run.
|
||||
* Applies to the most recent `--benchmark`, or all benchmarks if specified
|
||||
before any `--benchmark` arguments.
|
||||
|
||||
* `--cold-max-warmup-walltime <seconds>`
|
||||
* Stop cold warmup after the total warmup walltime exceeds `<seconds>`.
|
||||
* The limit is checked after each warmup run, so actual warmup time may exceed
|
||||
this value by one warmup run.
|
||||
* Default is -1 seconds (disabled).
|
||||
* Applies to the most recent `--benchmark`, or all benchmarks if specified
|
||||
before any `--benchmark` arguments.
|
||||
|
||||
* `--throttle-threshold <value>`
|
||||
* Set the GPU throttle threshold as percentage of the device's default clock rate.
|
||||
* Default is 75.
|
||||
|
||||
@@ -167,10 +167,23 @@ struct benchmark_base
|
||||
/// @}
|
||||
|
||||
/// Execute this many warmup runs before collecting cold measurement samples. @{
|
||||
[[nodiscard]] nvbench::int64_t get_warmup_runs() const { return m_warmup_runs; }
|
||||
benchmark_base &set_warmup_runs(nvbench::int64_t warmup_runs)
|
||||
[[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; }
|
||||
benchmark_base &set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs)
|
||||
{
|
||||
m_warmup_runs = warmup_runs > nvbench::int64_t{0} ? warmup_runs : nvbench::int64_t{1};
|
||||
m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs
|
||||
: nvbench::int64_t{1};
|
||||
return *this;
|
||||
}
|
||||
/// @}
|
||||
|
||||
/// Stop cold warmups after this many seconds of walltime. Non-positive values disable the limit. @{
|
||||
[[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const
|
||||
{
|
||||
return m_cold_max_warmup_walltime;
|
||||
}
|
||||
benchmark_base &set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime)
|
||||
{
|
||||
m_cold_max_warmup_walltime = cold_max_warmup_walltime;
|
||||
return *this;
|
||||
}
|
||||
/// @}
|
||||
@@ -330,8 +343,9 @@ protected:
|
||||
bool m_skip_batched{false};
|
||||
|
||||
nvbench::int64_t m_min_samples{10};
|
||||
nvbench::int64_t m_warmup_runs{1};
|
||||
nvbench::int64_t m_cold_warmup_runs{1};
|
||||
|
||||
nvbench::float64_t m_cold_max_warmup_walltime{-1.};
|
||||
nvbench::float64_t m_skip_time{-1.};
|
||||
nvbench::float64_t m_timeout{15.};
|
||||
|
||||
|
||||
@@ -43,8 +43,9 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
|
||||
result->m_run_once = m_run_once;
|
||||
result->m_disable_blocking_kernel = m_disable_blocking_kernel;
|
||||
|
||||
result->m_min_samples = m_min_samples;
|
||||
result->m_warmup_runs = m_warmup_runs;
|
||||
result->m_min_samples = m_min_samples;
|
||||
result->m_cold_warmup_runs = m_cold_warmup_runs;
|
||||
result->m_cold_max_warmup_walltime = m_cold_max_warmup_walltime;
|
||||
|
||||
result->m_skip_time = m_skip_time;
|
||||
result->m_timeout = m_timeout;
|
||||
|
||||
@@ -46,7 +46,8 @@ measure_cold_base::measure_cold_base(state &exec_state)
|
||||
, m_run_once{exec_state.get_run_once()}
|
||||
, m_check_throttling(!exec_state.get_run_once())
|
||||
, m_min_samples{exec_state.get_min_samples()}
|
||||
, m_warmup_runs{exec_state.get_warmup_runs()}
|
||||
, m_cold_warmup_runs{exec_state.get_cold_warmup_runs()}
|
||||
, m_cold_max_warmup_walltime{exec_state.get_cold_max_warmup_walltime()}
|
||||
, m_skip_time{exec_state.get_skip_time()}
|
||||
, m_timeout{exec_state.get_timeout()}
|
||||
, m_throttle_threshold(exec_state.get_throttle_threshold())
|
||||
|
||||
@@ -110,8 +110,9 @@ protected:
|
||||
bool m_check_throttling{true};
|
||||
|
||||
nvbench::int64_t m_min_samples{};
|
||||
nvbench::int64_t m_warmup_runs{1};
|
||||
nvbench::int64_t m_cold_warmup_runs{1};
|
||||
|
||||
nvbench::float64_t m_cold_max_warmup_walltime{};
|
||||
nvbench::float64_t m_skip_time{};
|
||||
nvbench::float64_t m_timeout{};
|
||||
|
||||
@@ -240,7 +241,7 @@ struct measure_cold : public measure_cold_base
|
||||
}
|
||||
|
||||
private:
|
||||
// Run the kernel m_warmup_runs times, measuring the GPU time of the last run.
|
||||
// Run the kernel m_cold_warmup_runs times, measuring the GPU time of the last run.
|
||||
// If under skip_time, skip the measurement.
|
||||
void run_warmup()
|
||||
{
|
||||
@@ -258,10 +259,19 @@ private:
|
||||
// see https://github.com/NVIDIA/nvbench/issues/240
|
||||
constexpr bool disable_blocking_kernel = true;
|
||||
kernel_launch_timer timer(*this, disable_blocking_kernel);
|
||||
nvbench::cpu_timer warmup_walltime_timer;
|
||||
|
||||
for (nvbench::int64_t warmup_run = 0; warmup_run < m_warmup_runs; ++warmup_run)
|
||||
warmup_walltime_timer.start();
|
||||
for (nvbench::int64_t warmup_run = 0; warmup_run < m_cold_warmup_runs; ++warmup_run)
|
||||
{
|
||||
this->launch_kernel(timer);
|
||||
warmup_walltime_timer.stop();
|
||||
|
||||
if (m_cold_max_warmup_walltime > 0. &&
|
||||
warmup_walltime_timer.get_duration() > m_cold_max_warmup_walltime)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
this->check_skip_time(m_cuda_timer.get_duration());
|
||||
}
|
||||
|
||||
@@ -429,10 +429,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
bench["name"] = bench_ptr->get_name();
|
||||
bench["index"] = bench_index;
|
||||
|
||||
bench["min_samples"] = bench_ptr->get_min_samples();
|
||||
bench["warmup_runs"] = bench_ptr->get_warmup_runs();
|
||||
bench["skip_time"] = bench_ptr->get_skip_time();
|
||||
bench["timeout"] = bench_ptr->get_timeout();
|
||||
bench["min_samples"] = bench_ptr->get_min_samples();
|
||||
bench["cold_warmup_runs"] = bench_ptr->get_cold_warmup_runs();
|
||||
bench["cold_max_warmup_walltime"] = bench_ptr->get_cold_max_warmup_walltime();
|
||||
bench["skip_time"] = bench_ptr->get_skip_time();
|
||||
bench["timeout"] = bench_ptr->get_timeout();
|
||||
|
||||
auto &devices = bench["devices"];
|
||||
for (const auto &dev_info : bench_ptr->get_devices())
|
||||
@@ -487,10 +488,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
|
||||
|
||||
st["name"] = exec_state.get_axis_values_as_string();
|
||||
|
||||
st["min_samples"] = exec_state.get_min_samples();
|
||||
st["warmup_runs"] = exec_state.get_warmup_runs();
|
||||
st["skip_time"] = exec_state.get_skip_time();
|
||||
st["timeout"] = exec_state.get_timeout();
|
||||
st["min_samples"] = exec_state.get_min_samples();
|
||||
st["cold_warmup_runs"] = exec_state.get_cold_warmup_runs();
|
||||
st["cold_max_warmup_walltime"] = exec_state.get_cold_max_warmup_walltime();
|
||||
st["skip_time"] = exec_state.get_skip_time();
|
||||
st["timeout"] = exec_state.get_timeout();
|
||||
|
||||
st["device"] = exec_state.get_device()->get_id();
|
||||
st["type_config_index"] = exec_state.get_type_config_index();
|
||||
|
||||
@@ -526,14 +526,14 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
|
||||
this->update_axis(first[1]);
|
||||
first += 2;
|
||||
}
|
||||
else if (arg == "--min-samples" || arg == "--warmup-runs")
|
||||
else if (arg == "--min-samples" || arg == "--cold-warmup-runs")
|
||||
{
|
||||
check_params(1);
|
||||
this->update_int64_prop(first[0], first[1]);
|
||||
first += 2;
|
||||
}
|
||||
else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" ||
|
||||
arg == "--throttle-recovery-delay")
|
||||
else if (arg == "--skip-time" || arg == "--timeout" || arg == "--cold-max-warmup-walltime" ||
|
||||
arg == "--throttle-threshold" || arg == "--throttle-recovery-delay")
|
||||
{
|
||||
check_params(1);
|
||||
this->update_float64_prop(first[0], first[1]);
|
||||
@@ -990,9 +990,9 @@ try
|
||||
{
|
||||
bench.set_min_samples(value);
|
||||
}
|
||||
else if (prop_arg == "--warmup-runs")
|
||||
else if (prop_arg == "--cold-warmup-runs")
|
||||
{
|
||||
bench.set_warmup_runs(value);
|
||||
bench.set_cold_warmup_runs(value);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1107,6 +1107,10 @@ try
|
||||
{
|
||||
bench.set_timeout(value);
|
||||
}
|
||||
else if (prop_arg == "--cold-max-warmup-walltime")
|
||||
{
|
||||
bench.set_cold_max_warmup_walltime(value);
|
||||
}
|
||||
else if (prop_arg == "--throttle-threshold")
|
||||
{
|
||||
bench.set_throttle_threshold(static_cast<nvbench::float32_t>(value) / 100.0f);
|
||||
|
||||
@@ -153,10 +153,22 @@ struct state
|
||||
/// @}
|
||||
|
||||
/// Execute this many warmup runs before collecting cold measurement samples. @{
|
||||
[[nodiscard]] nvbench::int64_t get_warmup_runs() const { return m_warmup_runs; }
|
||||
void set_warmup_runs(nvbench::int64_t warmup_runs)
|
||||
[[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; }
|
||||
void set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs)
|
||||
{
|
||||
m_warmup_runs = warmup_runs > nvbench::int64_t{0} ? warmup_runs : nvbench::int64_t{1};
|
||||
m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs
|
||||
: nvbench::int64_t{1};
|
||||
}
|
||||
/// @}
|
||||
|
||||
/// Stop cold warmups after this many seconds of walltime. Non-positive values disable the limit. @{
|
||||
[[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const
|
||||
{
|
||||
return m_cold_max_warmup_walltime;
|
||||
}
|
||||
void set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime)
|
||||
{
|
||||
m_cold_max_warmup_walltime = cold_max_warmup_walltime;
|
||||
}
|
||||
/// @}
|
||||
|
||||
@@ -340,8 +352,9 @@ private:
|
||||
std::string m_stopping_criterion;
|
||||
|
||||
nvbench::int64_t m_min_samples;
|
||||
nvbench::int64_t m_warmup_runs;
|
||||
nvbench::int64_t m_cold_warmup_runs;
|
||||
|
||||
nvbench::float64_t m_cold_max_warmup_walltime;
|
||||
nvbench::float64_t m_skip_time;
|
||||
nvbench::float64_t m_timeout;
|
||||
|
||||
|
||||
@@ -39,7 +39,8 @@ state::state(const benchmark_base &bench)
|
||||
, m_criterion_params{bench.get_criterion_params()}
|
||||
, m_stopping_criterion(bench.get_stopping_criterion())
|
||||
, m_min_samples{bench.get_min_samples()}
|
||||
, m_warmup_runs{bench.get_warmup_runs()}
|
||||
, m_cold_warmup_runs{bench.get_cold_warmup_runs()}
|
||||
, m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()}
|
||||
, m_skip_time{bench.get_skip_time()}
|
||||
, m_timeout{bench.get_timeout()}
|
||||
, m_throttle_threshold{bench.get_throttle_threshold()}
|
||||
@@ -62,7 +63,8 @@ state::state(const benchmark_base &bench,
|
||||
, m_criterion_params{bench.get_criterion_params()}
|
||||
, m_stopping_criterion(bench.get_stopping_criterion())
|
||||
, m_min_samples{bench.get_min_samples()}
|
||||
, m_warmup_runs{bench.get_warmup_runs()}
|
||||
, m_cold_warmup_runs{bench.get_cold_warmup_runs()}
|
||||
, m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()}
|
||||
, m_skip_time{bench.get_skip_time()}
|
||||
, m_timeout{bench.get_timeout()}
|
||||
, m_throttle_threshold{bench.get_throttle_threshold()}
|
||||
|
||||
@@ -1155,33 +1155,33 @@ void test_min_samples()
|
||||
ASSERT(states[0].get_min_samples() == 12345);
|
||||
}
|
||||
|
||||
void test_warmup_runs()
|
||||
void test_cold_warmup_runs()
|
||||
{
|
||||
{
|
||||
nvbench::option_parser parser;
|
||||
parser.parse({"--benchmark", "DummyBench", "--warmup-runs", "12345"});
|
||||
parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "12345"});
|
||||
const auto &states = parser_to_states(parser);
|
||||
|
||||
ASSERT(states.size() == 1);
|
||||
ASSERT(states[0].get_warmup_runs() == 12345);
|
||||
ASSERT(states[0].get_cold_warmup_runs() == 12345);
|
||||
}
|
||||
|
||||
{
|
||||
nvbench::option_parser parser;
|
||||
parser.parse({"--benchmark", "DummyBench", "--warmup-runs", "0"});
|
||||
parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "0"});
|
||||
const auto &states = parser_to_states(parser);
|
||||
|
||||
ASSERT(states.size() == 1);
|
||||
ASSERT(states[0].get_warmup_runs() == 1);
|
||||
ASSERT(states[0].get_cold_warmup_runs() == 1);
|
||||
}
|
||||
|
||||
{
|
||||
nvbench::option_parser parser;
|
||||
parser.parse({"--benchmark", "DummyBench", "--warmup-runs", "-12345"});
|
||||
parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "-12345"});
|
||||
const auto &states = parser_to_states(parser);
|
||||
|
||||
ASSERT(states.size() == 1);
|
||||
ASSERT(states[0].get_warmup_runs() == 1);
|
||||
ASSERT(states[0].get_cold_warmup_runs() == 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1195,6 +1195,16 @@ void test_skip_time()
|
||||
ASSERT(std::abs(states[0].get_skip_time() - 12345e2) < 1.);
|
||||
}
|
||||
|
||||
void test_cold_max_warmup_walltime()
|
||||
{
|
||||
nvbench::option_parser parser;
|
||||
parser.parse({"--benchmark", "DummyBench", "--cold-max-warmup-walltime", "12345e2"});
|
||||
const auto &states = parser_to_states(parser);
|
||||
|
||||
ASSERT(states.size() == 1);
|
||||
ASSERT(std::abs(states[0].get_cold_max_warmup_walltime() - 12345e2) < 1.);
|
||||
}
|
||||
|
||||
void test_timeout()
|
||||
{
|
||||
nvbench::option_parser parser;
|
||||
@@ -1496,8 +1506,9 @@ try
|
||||
test_axis_before_benchmark();
|
||||
|
||||
test_min_samples();
|
||||
test_warmup_runs();
|
||||
test_cold_warmup_runs();
|
||||
test_skip_time();
|
||||
test_cold_max_warmup_walltime();
|
||||
test_timeout();
|
||||
|
||||
test_stopping_criterion();
|
||||
|
||||
@@ -762,10 +762,11 @@ void test_devices()
|
||||
|
||||
void test_termination_criteria()
|
||||
{
|
||||
const nvbench::int64_t min_samples = 1000;
|
||||
const nvbench::int64_t warmup_runs = 7;
|
||||
const nvbench::float64_t skip_time = 4000;
|
||||
const nvbench::float64_t timeout = 5000;
|
||||
const nvbench::int64_t min_samples = 1000;
|
||||
const nvbench::int64_t cold_warmup_runs = 7;
|
||||
const nvbench::float64_t cold_max_warmup_walltime = 3000;
|
||||
const nvbench::float64_t skip_time = 4000;
|
||||
const nvbench::float64_t timeout = 5000;
|
||||
|
||||
// for comparing floats
|
||||
auto within_one = [](auto a, auto b) { return std::abs(a - b) < 1.; };
|
||||
@@ -773,7 +774,8 @@ void test_termination_criteria()
|
||||
dummy_bench bench;
|
||||
bench.set_devices(std::vector<int>{});
|
||||
bench.set_min_samples(min_samples);
|
||||
bench.set_warmup_runs(warmup_runs);
|
||||
bench.set_cold_warmup_runs(cold_warmup_runs);
|
||||
bench.set_cold_max_warmup_walltime(cold_max_warmup_walltime);
|
||||
bench.set_skip_time(skip_time);
|
||||
bench.set_timeout(timeout);
|
||||
|
||||
@@ -781,7 +783,8 @@ void test_termination_criteria()
|
||||
|
||||
ASSERT(states.size() == 1);
|
||||
ASSERT(min_samples == states[0].get_min_samples());
|
||||
ASSERT(warmup_runs == states[0].get_warmup_runs());
|
||||
ASSERT(cold_warmup_runs == states[0].get_cold_warmup_runs());
|
||||
ASSERT(within_one(cold_max_warmup_walltime, states[0].get_cold_max_warmup_walltime()));
|
||||
ASSERT(within_one(skip_time, states[0].get_skip_time()));
|
||||
ASSERT(within_one(timeout, states[0].get_timeout()));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user