Add termination criteria API.

- min_samples
- min_time
- max_noise
- skip_time (not yet implemented)
- timeout

Refactored s/(trials)|(iters)/samples/s.
This commit is contained in:
Allison Vacanti
2021-02-15 11:56:10 -05:00
parent e5914ff620
commit d323f569b8
9 changed files with 258 additions and 84 deletions

View File

@@ -16,8 +16,16 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
auto result = this->do_clone();
// Do not copy states.
result->m_name = m_name;
result->m_axes = m_axes;
result->m_name = m_name;
result->m_axes = m_axes;
result->m_devices = m_devices;
result->m_min_samples = m_min_samples;
result->m_min_time = m_min_time;
result->m_max_noise = m_max_noise;
result->m_skip_time = m_skip_time;
result->m_timeout = m_timeout;
return std::move(result);
}

View File

@@ -113,6 +113,67 @@ struct benchmark_base
void run() { this->do_run(); }
/// Execute at least this many samples per measurement. @{
[[nodiscard]] nvbench::int64_t get_min_samples() const
{
return m_min_samples;
}
benchmark_base &set_min_samples(nvbench::int64_t min_samples)
{
m_min_samples = min_samples;
return *this;
}
/// @}
/// Accumulate at least this many seconds of timing data per measurement. @{
[[nodiscard]] nvbench::float64_t get_min_time() const { return m_min_time; }
benchmark_base &set_min_time(nvbench::float64_t min_time)
{
m_min_time = min_time;
return *this;
}
/// @}
/// Specify the maximum amount of noise if a measurement supports noise.
/// Noise is the relative standard deviation expressed as a percentage:
/// `noise = 100 * (stdev / mean_time)`. @{
[[nodiscard]] nvbench::float64_t get_max_noise() const { return m_max_noise; }
benchmark_base &set_max_noise(nvbench::float64_t max_noise)
{
m_max_noise = max_noise;
return *this;
}
/// @}
/// If a warmup run finishes in less than `skip_time`, the measurement will
/// be skipped.
/// Extremely fast kernels (< 5000 ns) often time out before they can
/// accumulate `min_time` worth of measurements, and are often uninteresting.
/// Setting this value can help improve performance by skipping time-consuming
/// measurements that don't provide much information.
/// The default value is 0, which disables the feature.
/// @{
[[nodiscard]] nvbench::float64_t get_skip_time() const { return m_skip_time; }
benchmark_base &set_skip_time(nvbench::float64_t skip_time)
{
m_skip_time = skip_time;
return *this;
}
/// @}
/// If a measurement takes more than `timeout` seconds to complete, stop the
/// measurement early. A warning should be printed if this happens.
/// This setting overrides all other termination criteria.
/// Note that this is measured in CPU walltime, not sample time.
/// @{
[[nodiscard]] nvbench::float64_t get_timeout() const { return m_timeout; }
benchmark_base &set_timeout(nvbench::float64_t timeout)
{
m_timeout = timeout;
return *this;
}
/// @}
protected:
template <typename BenchmarkType>
friend struct runner;
@@ -122,6 +183,13 @@ protected:
std::vector<nvbench::device_info> m_devices;
std::vector<nvbench::state> m_states;
nvbench::int64_t m_min_samples{10};
nvbench::float64_t m_min_time{0.5};
nvbench::float64_t m_max_noise{0.5};
nvbench::float64_t m_skip_time{0.};
nvbench::float64_t m_timeout{15.};
private:
// route these through virtuals so the templated subclass can inject type info
virtual std::unique_ptr<benchmark_base> do_clone() const = 0;

View File

@@ -17,6 +17,14 @@ namespace nvbench
namespace detail
{
measure_cold_base::measure_cold_base(state &exec_state)
: m_state{exec_state}
, m_min_samples{exec_state.get_min_samples()}
, m_max_noise{exec_state.get_max_noise()}
, m_min_time{exec_state.get_min_time()}
, m_timeout{exec_state.get_timeout()}
{}
void measure_cold_base::check()
{
const auto device = m_state.get_device();
@@ -38,8 +46,8 @@ void measure_cold_base::check()
void measure_cold_base::generate_summaries()
{
const auto d_iters = static_cast<double>(m_total_iters);
const auto avg_cuda_time = m_total_cuda_time / d_iters;
const auto d_samples = static_cast<double>(m_total_samples);
const auto avg_cuda_time = m_total_cuda_time / d_samples;
{
auto &summ = m_state.add_summary("Average GPU Time (Cold)");
summ.set_string("hint", "duration");
@@ -60,7 +68,7 @@ void measure_cold_base::generate_summaries()
summ.set_float64("value", m_cuda_noise);
}
const auto avg_cpu_time = m_total_cpu_time / d_iters;
const auto avg_cpu_time = m_total_cpu_time / d_samples;
{
auto &summ = m_state.add_summary("Average CPU Time (Cold)");
summ.set_string("hint", "duration");
@@ -82,11 +90,11 @@ void measure_cold_base::generate_summaries()
}
{
auto &summ = m_state.add_summary("Number of Trials (Cold)");
summ.set_string("short_name", "Trials");
auto &summ = m_state.add_summary("Number of Samples (Cold)");
summ.set_string("short_name", "Samples");
summ.set_string("description",
"Number of kernel executions in cold time measurements.");
summ.set_int64("value", m_total_iters);
summ.set_int64("value", m_total_samples);
}
// Log to stdout:
@@ -127,7 +135,7 @@ void measure_cold_base::generate_summaries()
avg_cuda_time * 1e3,
avg_cpu_time * 1e3,
m_total_cuda_time,
m_total_iters);
m_total_samples);
if (m_max_time_exceeded)
{
if (m_cuda_noise > m_max_noise)
@@ -137,12 +145,12 @@ void measure_cold_base::generate_summaries()
m_cuda_noise,
m_max_noise);
}
if (m_total_iters < m_min_iters)
if (m_total_samples < m_min_samples)
{
fmt::print("!!!! Previous benchmark exceeded max time before "
"accumulating min samples ({} < {})\n",
m_total_iters,
m_min_iters);
m_total_samples,
m_min_samples);
}
if (m_total_cuda_time < m_min_time)
{

View File

@@ -26,9 +26,7 @@ namespace detail
// non-templated code goes here:
struct measure_cold_base
{
explicit measure_cold_base(nvbench::state &exec_state)
: m_state(exec_state)
{}
explicit measure_cold_base(nvbench::state &exec_state);
measure_cold_base(const measure_cold_base &) = delete;
measure_cold_base(measure_cold_base &&) = delete;
measure_cold_base &operator=(const measure_cold_base &) = delete;
@@ -43,7 +41,7 @@ protected:
m_total_cpu_time = 0.;
m_cuda_noise = 0.;
m_cpu_noise = 0.;
m_total_iters = 0;
m_total_samples = 0;
m_cuda_times.clear();
m_cpu_times.clear();
m_max_time_exceeded = false;
@@ -59,18 +57,16 @@ protected:
nvbench::cpu_timer m_timeout_timer;
nvbench::detail::l2flush m_l2flush;
nvbench::int64_t m_min_iters{10};
nvbench::int64_t m_total_iters{};
nvbench::float64_t m_max_noise{0.5}; // % rel stdev
nvbench::float64_t m_cuda_noise{}; // % rel stdev
nvbench::float64_t m_cpu_noise{}; // % rel stdev
nvbench::float64_t m_min_time{0.5};
nvbench::float64_t m_max_time{5.0};
nvbench::int64_t m_min_samples{};
nvbench::float64_t m_max_noise{}; // % rel stdev
nvbench::float64_t m_min_time{};
nvbench::float64_t m_timeout{};
nvbench::int64_t m_total_samples{};
nvbench::float64_t m_total_cuda_time{};
nvbench::float64_t m_total_cpu_time{};
nvbench::float64_t m_cuda_noise{}; // % rel stdev
nvbench::float64_t m_cpu_noise{}; // % rel stdev
std::vector<nvbench::float64_t> m_cuda_times;
std::vector<nvbench::float64_t> m_cpu_times;
@@ -128,7 +124,7 @@ private:
m_cpu_times.push_back(cur_cpu_time);
m_total_cuda_time += cur_cuda_time;
m_total_cpu_time += cur_cpu_time;
++m_total_iters;
++m_total_samples;
// Only consider the cuda noise in the convergence criteria.
m_cuda_noise = nvbench::detail::compute_noise(m_cuda_times,
@@ -137,14 +133,14 @@ private:
m_timeout_timer.stop();
const auto total_time = m_timeout_timer.get_duration();
if (m_total_cuda_time > m_min_time && // Min time okay
m_total_iters > m_min_iters && // Min iters okay
m_cuda_noise < m_max_noise) // Noise okay
if (m_total_cuda_time > m_min_time && // Min time okay
m_total_samples > m_min_samples && // Min samples okay
m_cuda_noise < m_max_noise) // Noise okay
{
break;
}
if (total_time > m_max_time) // Max time exceeded, stop iterating.
if (total_time > m_timeout) // Max time exceeded, stop iterating.
{
m_max_time_exceeded = true;
break;

View File

@@ -36,35 +36,35 @@ void measure_hot_base::check()
}
measure_hot_base::measure_hot_base(state &exec_state)
: m_state(exec_state)
: m_state{exec_state}
, m_min_samples{exec_state.get_min_samples()}
, m_min_time{exec_state.get_min_time()}
, m_timeout{exec_state.get_timeout()}
{
// Since cold measures converge to a stable result, increase the min_iters
// Since cold measures converge to a stable result, increase the min_samples
// to match the cold result if available.
try
{
nvbench::int64_t cold_iters =
m_state.get_summary("Number of Trials (Cold)").get_int64("value");
m_min_iters = std::max(m_min_iters, cold_iters);
nvbench::int64_t cold_samples =
m_state.get_summary("Number of Samples (Cold)").get_int64("value");
m_min_samples = std::max(m_min_samples, cold_samples);
}
catch (...)
{
// TODO Need state API
// m_min_iters = state.get_min_trials();
//
// Apply the target_time since we don't have noise convergence estimates
// from the cold executions:
// TODO Need state API. Replace the following line with the commented one
const auto target_time = (m_min_time + m_max_time) / 2.;
// const auto target_time = state.get_target_time();
m_min_time = std::max(m_min_time, target_time);
// If the above threw an exception, we don't have a cold measurement to use.
// Estimate a target_time between m_min_time and m_timeout.
// Use the average of the min_time and timeout, but don't go over 5x
// min_time in case timeout is huge.
// We could expose a `target_time` property on benchmark_base/state if
// needed.
m_min_time = std::min((m_min_time + m_timeout) / 2., m_min_time * 5);
}
}
void measure_hot_base::generate_summaries()
{
const auto d_iters = static_cast<double>(m_total_iters);
const auto avg_cuda_time = m_total_cuda_time / d_iters;
const auto d_samples = static_cast<double>(m_total_samples);
const auto avg_cuda_time = m_total_cuda_time / d_samples;
{
auto &summ = m_state.add_summary("Average GPU Time (Hot)");
summ.set_string("hint", "duration");
@@ -75,7 +75,7 @@ void measure_hot_base::generate_summaries()
summ.set_float64("value", avg_cuda_time);
}
const auto avg_cpu_time = m_total_cpu_time / d_iters;
const auto avg_cpu_time = m_total_cpu_time / d_samples;
{
auto &summ = m_state.add_summary("Average CPU Time (Hot)");
summ.set_string("hide",
@@ -89,11 +89,11 @@ void measure_hot_base::generate_summaries()
}
{
auto &summ = m_state.add_summary("Number of Trials (Hot)");
summ.set_string("short_name", "Trials");
auto &summ = m_state.add_summary("Number of Samples (Hot)");
summ.set_string("short_name", "Samples");
summ.set_string("description",
"Number of kernel executions in hot time measurements.");
summ.set_int64("value", m_total_iters);
summ.set_int64("value", m_total_samples);
}
if (const auto items = m_state.get_items_processed_per_launch(); items != 0)
@@ -171,15 +171,15 @@ void measure_hot_base::generate_summaries()
avg_cuda_time * 1e3,
avg_cpu_time * 1e3,
m_total_cuda_time,
m_total_iters);
m_total_samples);
if (m_max_time_exceeded)
{
if (m_total_iters < m_min_iters)
if (m_total_samples < m_min_samples)
{
fmt::print("!!!! Previous benchmark exceeded max time before "
"accumulating min samples ({} < {})\n",
m_total_iters,
m_min_iters);
m_total_samples,
m_min_samples);
}
if (m_total_cuda_time < m_min_time)
{

View File

@@ -33,7 +33,7 @@ protected:
{
m_total_cpu_time = 0.;
m_total_cuda_time = 0.;
m_total_iters = 0;
m_total_samples = 0;
m_max_time_exceeded = false;
}
@@ -46,12 +46,11 @@ protected:
nvbench::cpu_timer m_cpu_timer;
nvbench::cpu_timer m_timeout_timer;
nvbench::int64_t m_total_iters{};
nvbench::int64_t m_min_iters{10};
nvbench::float64_t m_min_time{0.5};
nvbench::float64_t m_max_time{5.0};
nvbench::int64_t m_min_samples{};
nvbench::float64_t m_min_time{};
nvbench::float64_t m_timeout{};
nvbench::int64_t m_total_samples{};
nvbench::float64_t m_total_cuda_time{};
nvbench::float64_t m_total_cpu_time{};
@@ -102,7 +101,7 @@ private:
// Block stream until some work is queued.
// Limit the number of kernel executions while blocked to prevent
// deadlocks. See warnings on blocking_kernel.
const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
const auto unblocked_launches = batch_size - blocked_launches;
blocker.block(m_launch.get_stream());
@@ -129,22 +128,22 @@ private:
m_total_cpu_time += m_cpu_timer.get_duration();
m_total_cuda_time += m_cuda_timer.get_duration();
m_total_iters += batch_size;
m_total_samples += batch_size;
// Predict number of remaining iterations:
batch_size = (m_min_time - m_total_cuda_time) /
(m_total_cuda_time / m_total_iters);
(m_total_cuda_time / m_total_samples);
m_timeout_timer.stop();
const auto total_time = m_timeout_timer.get_duration();
if (m_total_cuda_time > m_min_time && // min time okay
m_total_iters > m_min_iters) // min iters okay
m_total_samples > m_min_samples) // min samples okay
{
break; // Stop iterating
}
if (m_total_cuda_time > m_max_time)
if (m_total_cuda_time > m_timeout)
{
m_max_time_exceeded = true;
break;

View File

@@ -1,5 +1,6 @@
#include <nvbench/state.cuh>
#include <nvbench/benchmark_base.cuh>
#include <nvbench/types.cuh>
#include <fmt/format.h>
@@ -11,6 +12,30 @@
namespace nvbench
{
state::state(const benchmark_base &bench)
: m_benchmark{bench}
, m_min_samples{bench.get_min_samples()}
, m_min_time{bench.get_min_time()}
, m_max_noise{bench.get_max_noise()}
, m_skip_time{bench.get_skip_time()}
, m_timeout{bench.get_timeout()}
{}
state::state(const benchmark_base &bench,
nvbench::named_values values,
std::optional<nvbench::device_info> device,
std::size_t type_config_index)
: m_benchmark{bench}
, m_axis_values{std::move(values)}
, m_device{std::move(device)}
, m_type_config_index{type_config_index}
, m_min_samples{bench.get_min_samples()}
, m_min_time{bench.get_min_time()}
, m_max_noise{bench.get_max_noise()}
, m_skip_time{bench.get_skip_time()}
, m_timeout{bench.get_timeout()}
{}
nvbench::int64_t state::get_int64(const std::string &axis_name) const
{
return m_axis_values.get_int64(axis_name);

View File

@@ -90,6 +90,50 @@ struct state
return m_skip_reason;
}
/// Execute at least this many samples per measurement. @{
[[nodiscard]] nvbench::int64_t get_min_samples() const
{
return m_min_samples;
}
void set_min_samples(nvbench::int64_t min_samples)
{
m_min_samples = min_samples;
}
/// @}
/// Accumulate at least this many seconds of timing data per measurement. @{
[[nodiscard]] nvbench::float64_t get_min_time() const { return m_min_time; }
void set_min_time(nvbench::float64_t min_time) { m_min_time = min_time; }
/// @}
/// Specify the maximum amount of noise if a measurement supports noise.
/// Noise is the relative standard deviation expressed as a percentage:
/// `noise = 100 * (stdev / mean_time)`. @{
[[nodiscard]] nvbench::float64_t get_max_noise() const { return m_max_noise; }
void set_max_noise(nvbench::float64_t max_noise) { m_max_noise = max_noise; }
/// @}
/// If a warmup run finishes in less than `skip_time`, the measurement will
/// be skipped.
/// Extremely fast kernels (< 5000 ns) often time out before they can
/// accumulate `min_time` worth of measurements, and are often uninteresting.
/// Setting this value can help improve performance by skipping time-consuming
/// measurements that don't provide much information.
/// The default value is 0, which disables the feature.
/// @{
[[nodiscard]] nvbench::float64_t get_skip_time() const { return m_skip_time; }
void set_skip_time(nvbench::float64_t skip_time) { m_skip_time = skip_time; }
/// @}
/// If a measurement takes more than `timeout` seconds to complete, stop the
/// measurement early. A warning should be printed if this happens.
/// This setting overrides all other termination criteria.
/// Note that this is measured in CPU walltime, not sample time.
/// @{
[[nodiscard]] nvbench::float64_t get_timeout() const { return m_timeout; }
void set_timeout(nvbench::float64_t timeout) { m_timeout = timeout; }
/// @}
[[nodiscard]] const named_values &get_axis_values() const
{
return m_axis_values;
@@ -111,25 +155,25 @@ private:
friend struct nvbench::detail::state_generator;
friend struct nvbench::detail::state_tester;
explicit state(const benchmark_base &bench)
: m_benchmark{bench}
{}
explicit state(const benchmark_base &bench);
state(const benchmark_base &bench,
nvbench::named_values values,
std::optional<nvbench::device_info> device,
std::size_t type_config_index)
: m_benchmark{bench}
, m_axis_values{std::move(values)}
, m_device{std::move(device)}
, m_type_config_index{type_config_index}
{}
std::size_t type_config_index);
std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
nvbench::named_values m_axis_values;
std::optional<nvbench::device_info> m_device;
std::size_t m_type_config_index{};
nvbench::int64_t m_min_samples;
nvbench::float64_t m_min_time;
nvbench::float64_t m_max_noise;
nvbench::float64_t m_skip_time;
nvbench::float64_t m_timeout;
std::vector<nvbench::summary> m_summaries;
std::string m_skip_reason;
nvbench::int64_t m_items_processed_per_launch{};

View File

@@ -717,16 +717,10 @@ void test_devices()
ASSERT(states.size() == 12);
fmt::memory_buffer buffer;
const std::string table_format =
"| {:^5} | {:^6} | {:^5} | {:^3} |\n";
const std::string table_format = "| {:^5} | {:^6} | {:^5} | {:^3} |\n";
fmt::format_to(buffer, "\n");
fmt::format_to(buffer,
table_format,
"State",
"Device",
"S",
"I");
fmt::format_to(buffer, table_format, "State", "Device", "S", "I");
std::size_t config = 0;
for (const auto &state : states)
@@ -760,6 +754,36 @@ void test_devices()
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
void test_termination_criteria()
{
const nvbench::int64_t min_samples = 1000;
const nvbench::float64_t min_time = 2000;
const nvbench::float64_t max_noise = 3000;
const nvbench::float64_t skip_time = 4000;
const nvbench::float64_t timeout = 5000;
// for comparing floats
auto within_one = [](auto a, auto b) { return std::abs(a - b) < 1.; };
dummy_bench bench;
bench.set_devices(std::vector<int>{});
bench.set_min_samples(min_samples);
bench.set_min_time(min_time);
bench.set_max_noise(max_noise);
bench.set_skip_time(skip_time);
bench.set_timeout(timeout);
const std::vector<nvbench::state> states =
nvbench::detail::state_generator::create(bench);
ASSERT(states.size() == 1);
ASSERT(min_samples == states[0].get_min_samples());
ASSERT(within_one(min_time, states[0].get_min_time()));
ASSERT(within_one(max_noise, states[0].get_max_noise()));
ASSERT(within_one(skip_time, states[0].get_skip_time()));
ASSERT(within_one(timeout, states[0].get_timeout()));
}
int main()
try
{
@@ -770,9 +794,11 @@ try
test_create_with_types();
test_create_with_masked_types();
test_devices();
test_termination_criteria();
return 0;
}
catch (std::exception& e)
catch (std::exception &e)
{
fmt::print("{}\n", e.what());
return 1;