Implement new convergence check for noisy kernels.

Previously, convergence was tested by waiting for the relative stdev
of cuda timings ("noise") to drop below a certain percentage
(`max_noise`).

This assumed that all benchmarks would eventually see their noise drop
to some threshold, but this is not the case. In practice, many benchmarks
never converge to the default 0.5% relative stdev and instead will always
run to the 15s timeout -- even if the means have converged in a second
or two.

Added a new check that detects when the noise itself has stabilized and
ends the benchmark at that point, even if noise > max_noise.

After testing, this patch alone significantly reduces the runtime of the
Thrust+CUB benchmark suite (from 30 hours to 5 hours) and produces similar
timing results.

The parameters used to tune this feature are not exposed -- if this
approach works long-term and there's a strong motivation to let users
tweak them, then we can worry about names/APIs/CLI/docs later.
This commit is contained in:
Allison Vacanti
2021-12-21 21:16:17 -05:00
parent 8e56a7bd94
commit 178dd0eb68
6 changed files with 382 additions and 87 deletions

View File

@@ -19,12 +19,14 @@
#include <nvbench/detail/measure_cold.cuh>
#include <nvbench/benchmark_base.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/printer_base.cuh>
#include <nvbench/state.cuh>
#include <nvbench/summary.cuh>
#include <nvbench/detail/ring_buffer.cuh>
#include <nvbench/detail/throw.cuh>
#include <fmt/format.h>
#include <algorithm>
@@ -43,7 +45,7 @@ measure_cold_base::measure_cold_base(state &exec_state)
, m_min_time{exec_state.get_min_time()}
, m_skip_time{exec_state.get_skip_time()}
, m_timeout{exec_state.get_timeout()}
{ }
{}
void measure_cold_base::check()
{
@@ -62,6 +64,113 @@ void measure_cold_base::check()
}
}
// Resets every accumulator, sample container and flag that the trial loop
// updates, so a new measurement starts from a clean state.
void measure_cold_base::initialize()
{
  // Per-sample storage and the trailing noise history:
  m_cuda_times.clear();
  m_cpu_times.clear();
  m_noise_tracker.clear();

  // Running totals and derived statistics:
  m_total_samples   = 0;
  m_total_cuda_time = 0.;
  m_total_cpu_time  = 0.;
  m_cpu_noise       = 0.;

  // Timeout state:
  m_max_time_exceeded = false;
}
// Starts the wall-clock timer whose duration is later compared against
// m_timeout.
void measure_cold_base::run_trials_prologue()
{
  m_walltime_timer.start();
}
void measure_cold_base::record_measurements()
{
// Update and record timers and counters:
const auto cur_cuda_time = m_cuda_timer.get_duration();
const auto cur_cpu_time = m_cpu_timer.get_duration();
m_cuda_times.push_back(cur_cuda_time);
m_cpu_times.push_back(cur_cpu_time);
m_total_cuda_time += cur_cuda_time;
m_total_cpu_time += cur_cpu_time;
++m_total_samples;
// Compute convergence statistics using CUDA timings:
const auto mean_cuda_time = m_total_cuda_time /
static_cast<nvbench::float64_t>(m_total_samples);
const auto cuda_stdev =
nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
m_cuda_times.cend(),
mean_cuda_time);
auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
if (std::isfinite(cuda_rel_stdev))
{
m_noise_tracker.push_back(cuda_rel_stdev);
}
}
// Decides whether the trial loop should stop.
//
// Returns true when:
// - the benchmark is configured to run only once, or
// - enough time/samples were gathered AND either the latest noise is below
//   m_max_noise or the noise itself has stabilized, or
// - the wall-clock timeout was exceeded (also sets m_max_time_exceeded).
bool measure_cold_base::is_finished()
{
  if (m_run_once)
  {
    return true;
  }

  // Check that we've gathered enough samples:
  const bool enough_samples = m_total_cuda_time > m_min_time &&
                              m_total_samples > m_min_samples;

  // record_measurements() only stores finite noise values, so the history
  // can still be empty here (e.g. with very small min-sample settings).
  // Calling back() on an empty buffer is invalid, so treat "no noise
  // estimate yet" as "not converged" and fall through to the timeout check.
  if (enough_samples && !m_noise_tracker.empty())
  {
    const auto current_noise = m_noise_tracker.back();

    // Noise has dropped below threshold
    if (current_noise < m_max_noise)
    {
      return true;
    }

    // Check if the noise (cuda rel stdev) has converged by inspecting a
    // trailing window of recorded noise measurements.
    // This helps identify benchmarks that are inherently noisy and would
    // never converge to the target stdev threshold. This check ensures that
    // the benchmark will end if the stdev stabilizes above the target
    // threshold.
    // Gather some iterations before checking noise, and limit how often we
    // check this.
    if (m_noise_tracker.size() > 64 && (m_total_samples % 16 == 0))
    {
      // Use the current noise as the stdev reference.
      const auto noise_stdev = nvbench::detail::statistics::standard_deviation(
        m_noise_tracker.cbegin(),
        m_noise_tracker.cend(),
        current_noise);
      const auto noise_rel_stdev = noise_stdev / current_noise;

      // If the rel stdev of the last N cuda noise measurements is less than
      // 5%, consider the result stable.
      const auto noise_threshold = 0.05;
      if (noise_rel_stdev < noise_threshold)
      {
        return true;
      }
    }
  }

  // Check for timeouts:
  m_walltime_timer.stop();
  if (m_walltime_timer.get_duration() > m_timeout)
  {
    m_max_time_exceeded = true;
    return true;
  }

  return false;
}
// Finalizes the statistics that are only needed once the trial loop has
// ended: computes the CPU noise (relative stdev of the CPU timings) and
// stops the wall-clock timer.
void measure_cold_base::run_trials_epilogue()
{
  // Only need to compute this at the end, not per iteration.
  // The mean must come from the CPU total (not the CUDA total), and
  // standard_deviation() expects the mean, not the sum of the samples.
  const auto cpu_mean = m_total_cpu_time /
                        static_cast<nvbench::float64_t>(m_total_samples);
  const auto cpu_stdev =
    nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
                                                    m_cpu_times.cend(),
                                                    cpu_mean);
  m_cpu_noise = cpu_stdev / cpu_mean;

  m_walltime_timer.stop();
}
void measure_cold_base::generate_summaries()
{
const auto d_samples = static_cast<double>(m_total_samples);
@@ -113,7 +222,10 @@ void measure_cold_base::generate_summaries()
summ.set_string("description",
"Relative standard deviation of the cold GPU execution "
"time measurements.");
summ.set_float64("value", m_cuda_noise);
summ.set_float64("value",
m_noise_tracker.empty()
? std::numeric_limits<nvbench::float64_t>::infinity()
: m_noise_tracker.back());
}
if (const auto items = m_state.get_element_count(); items != 0)
@@ -161,16 +273,16 @@ void measure_cold_base::generate_summaries()
if (m_max_time_exceeded)
{
const auto timeout = m_timeout_timer.get_duration();
const auto timeout = m_walltime_timer.get_duration();
if (m_cuda_noise > m_max_noise)
if (!m_noise_tracker.empty() && m_noise_tracker.back() > m_max_noise)
{
printer.log(nvbench::log_level::warn,
fmt::format("Current measurement timed out ({:0.2f}s) "
"while over noise threshold ({:0.2f}% > "
"{:0.2f}%)",
timeout,
m_cuda_noise * 100,
m_noise_tracker.back() * 100,
m_max_noise * 100));
}
if (m_total_samples < m_min_samples)

View File

@@ -28,6 +28,7 @@
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#include <nvbench/detail/l2flush.cuh>
#include <nvbench/detail/ring_buffer.cuh>
#include <nvbench/detail/statistics.cuh>
#include <cuda_runtime.h>
@@ -58,19 +59,11 @@ protected:
struct kernel_launch_timer;
void check();
void initialize()
{
m_total_cuda_time = 0.;
m_total_cpu_time = 0.;
m_cuda_noise = 0.;
m_cpu_noise = 0.;
m_total_samples = 0;
m_cuda_times.clear();
m_cpu_times.clear();
m_max_time_exceeded = false;
}
void initialize();
void run_trials_prologue();
void record_measurements();
bool is_finished();
void run_trials_epilogue();
void generate_summaries();
void check_skip_time(nvbench::float64_t warmup_time);
@@ -86,7 +79,6 @@ protected:
}
void block_stream();
__forceinline__ void unblock_stream() { m_blocker.unblock(); }
nvbench::state &m_state;
@@ -94,7 +86,7 @@ protected:
nvbench::launch m_launch;
nvbench::cuda_timer m_cuda_timer;
nvbench::cpu_timer m_cpu_timer;
nvbench::cpu_timer m_timeout_timer;
nvbench::cpu_timer m_walltime_timer;
nvbench::detail::l2flush m_l2flush;
nvbench::blocking_kernel m_blocker;
@@ -110,8 +102,10 @@ protected:
nvbench::int64_t m_total_samples{};
nvbench::float64_t m_total_cuda_time{};
nvbench::float64_t m_total_cpu_time{};
nvbench::float64_t m_cuda_noise{}; // rel stdev
nvbench::float64_t m_cpu_noise{}; // rel stdev
nvbench::float64_t m_cpu_noise{}; // rel stdev
// Trailing history of noise measurements for convergence tests
nvbench::detail::ring_buffer<nvbench::float64_t> m_noise_tracker{512};
std::vector<nvbench::float64_t> m_cuda_times;
std::vector<nvbench::float64_t> m_cpu_times;
@@ -170,7 +164,11 @@ struct measure_cold : public measure_cold_base
this->check();
this->initialize();
this->run_warmup();
this->run_trials_prologue();
this->run_trials();
this->run_trials_epilogue();
this->generate_summaries();
}
@@ -192,47 +190,12 @@ private:
void run_trials()
{
m_timeout_timer.start();
kernel_launch_timer<use_blocking_kernel> timer(*this);
do
{
this->launch_kernel(timer);
const auto cur_cuda_time = m_cuda_timer.get_duration();
const auto cur_cpu_time = m_cpu_timer.get_duration();
m_cuda_times.push_back(cur_cuda_time);
m_cpu_times.push_back(cur_cpu_time);
m_total_cuda_time += cur_cuda_time;
m_total_cpu_time += cur_cpu_time;
++m_total_samples;
// Only consider the cuda noise in the convergence criteria.
m_cuda_noise = nvbench::detail::compute_noise(m_cuda_times,
m_total_cuda_time);
m_timeout_timer.stop();
const auto total_time = m_timeout_timer.get_duration();
if (m_run_once)
{
break;
}
if (m_total_cuda_time > m_min_time && // Min time okay
m_total_samples > m_min_samples && // Min samples okay
m_cuda_noise < m_max_noise) // Noise okay
{
break;
}
if (total_time > m_timeout) // Max time exceeded, stop iterating.
{
m_max_time_exceeded = true;
break;
}
} while (true);
m_cpu_noise = nvbench::detail::compute_noise(m_cpu_times, m_total_cpu_time);
this->record_measurements();
} while (!this->is_finished());
}
template <typename TimerT>

View File

@@ -0,0 +1,129 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/config.cuh>
#include <nvbench/detail/statistics.cuh>
#include <cassert>
#include <vector>
namespace nvbench::detail
{
/**
 * @brief A simple ring buffer whose capacity is chosen at construction time.
 *
 * Values are appended with push_back(); once capacity() values are stored,
 * each new value overwrites the oldest one. Storage is allocated once in the
 * constructor and never resized.
 */
template <typename T>
struct ring_buffer
{
  /**
   * Create a new ring buffer with the requested capacity.
   *
   * A capacity of zero is permitted; such a buffer is always empty and
   * push_back() discards its argument.
   */
  explicit ring_buffer(std::size_t capacity)
      : m_buffer(capacity)
  {}

  /**
   * Iterators provide all values in the ring buffer in unspecified order.
   * The range [begin, end) always covers exactly size() elements.
   * @{
   */
  // clang-format off
  [[nodiscard]] auto begin()        { return m_buffer.begin(); }
  [[nodiscard]] auto begin() const  { return m_buffer.begin(); }
  [[nodiscard]] auto cbegin() const { return m_buffer.cbegin(); }
  [[nodiscard]] auto end()          { return m_buffer.begin() + this->size(); }
  [[nodiscard]] auto end() const    { return m_buffer.begin() + this->size(); }
  [[nodiscard]] auto cend() const   { return m_buffer.cbegin() + this->size(); }
  // clang-format on
  /** @} */

  /**
   * The number of valid values in the ring buffer. Always <= capacity().
   */
  [[nodiscard]] std::size_t size() const
  {
    return m_full ? m_buffer.size() : m_index;
  }

  /**
   * The maximum size of the ring buffer.
   */
  [[nodiscard]] std::size_t capacity() const { return m_buffer.size(); }

  /**
   * @return True if the ring buffer is empty.
   */
  [[nodiscard]] bool empty() const { return m_index == 0 && !m_full; }

  /**
   * Remove all values from the buffer without modifying capacity.
   */
  void clear()
  {
    m_index = 0;
    m_full  = false;
  }

  /**
   * Add a new value to the ring buffer. If size() == capacity(), the oldest
   * element in the buffer is overwritten.
   *
   * For a zero-capacity buffer this is a no-op: there is no slot to write to,
   * and wrapping the index would otherwise divide by zero.
   */
  void push_back(T val)
  {
    if (m_buffer.empty())
    {
      return;
    }

    assert(m_index < m_buffer.size());
    m_buffer[m_index] = val;

    m_index = (m_index + 1) % m_buffer.size();
    if (m_index == 0)
    { // buffer wrapped
      m_full = true;
    }
  }

  /**
   * Get a reference to the most recently added value.
   *
   * The buffer must not be empty. The reference is to the element's slot in
   * the buffer, so a later push_back() may overwrite the referenced value.
   * @{
   */
  [[nodiscard]] typename std::vector<T>::const_reference back() const
  {
    assert(!this->empty());
    return m_buffer[this->back_index()];
  }
  [[nodiscard]] typename std::vector<T>::reference back()
  {
    assert(!this->empty());
    return m_buffer[this->back_index()];
  }
  /**@}*/

private:
  // Index of the slot written by the most recent push_back(). m_index points
  // at the *next* slot to write, so step back one position with wrap-around.
  [[nodiscard]] std::size_t back_index() const
  {
    return (m_index == 0 ? m_buffer.size() : m_index) - 1;
  }

  std::vector<T> m_buffer;
  std::size_t m_index{0};
  bool m_full{false};
};
} // namespace nvbench::detail

View File

@@ -18,48 +18,47 @@
#pragma once
#include <nvbench/types.cuh>
#include <nvbench/detail/transform_reduce.cuh>
#include <cmath>
#include <functional>
#include <iterator>
#include <limits>
#include <numeric>
#include <vector>
#include <type_traits>
namespace nvbench::detail
namespace nvbench::detail::statistics
{
/**
* Given a vector of samples and the precomputed sum of all samples in the
* vector, return a measure of the noise in the samples.
* Computes and returns the unbiased sample standard deviation.
*
* The noise metric is the relative unbiased sample standard deviation
* (std_dev / mean).
* If the input has fewer than 5 samples, infinity is returned.
*/
inline nvbench::float64_t
compute_noise(const std::vector<nvbench::float64_t> &data,
nvbench::float64_t sum)
template <typename Iter,
typename ValueType = typename std::iterator_traits<Iter>::value_type>
ValueType standard_deviation(Iter first, Iter last, ValueType mean)
{
const auto num = static_cast<nvbench::float64_t>(data.size());
static_assert(std::is_floating_point_v<ValueType>);
const auto num = last - first;
if (num < 5) // don't bother with low sample sizes.
{
return std::numeric_limits<nvbench::float64_t>::infinity();
return std::numeric_limits<ValueType>::infinity();
}
const auto mean = sum / num;
const auto variance =
nvbench::detail::transform_reduce(data.cbegin(),
data.cend(),
0.,
std::plus<>{},
[mean](nvbench::float64_t val) {
val -= mean;
val *= val;
return val;
}) /
(num - 1);
const auto abs_stdev = std::sqrt(variance);
return abs_stdev / mean;
const auto variance = nvbench::detail::transform_reduce(first,
last,
ValueType{},
std::plus<>{},
[mean](auto val) {
val -= mean;
val *= val;
return val;
}) /
static_cast<ValueType>((num - 1));
return std::sqrt(variance);
}
} // namespace nvbench::detail
} // namespace nvbench::detail::statistics

View File

@@ -10,6 +10,7 @@ set(test_srcs
named_values.cu
option_parser.cu
range.cu
ring_buffer.cu
runner.cu
state.cu
state_generator.cu
@@ -36,3 +37,4 @@ foreach(test_src IN LISTS test_srcs)
endforeach()
add_subdirectory(cmake)
add_subdirectory(device)

90
testing/ring_buffer.cu Normal file
View File

@@ -0,0 +1,90 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/detail/ring_buffer.cuh>
#include "test_asserts.cuh"
#include <algorithm>
#include <vector>
// Returns true when the valid elements of `buffer` (cbegin() to cend(), in
// storage order) match the leading elements of `reference`. Only as many
// elements as the buffer currently holds are compared; sizes are not checked.
template <typename T>
bool equal(const nvbench::detail::ring_buffer<T> &buffer,
           const std::vector<T> &reference)
{
  const auto buffer_end = buffer.cend();
  const auto first_diff =
    std::mismatch(buffer.cbegin(), buffer_end, reference.cbegin());
  return first_diff.first == buffer_end;
}
// Exercises nvbench::detail::ring_buffer: construction, filling up to
// capacity, wrap-around overwrites, back(), and clear().
// Returns 0 on success; prints the exception message and returns 1 if an
// exception derived from std::exception escapes the test body.
int main()
try
{
  // Freshly constructed: capacity is set, but no valid elements yet.
  // Note that equal() only compares size() elements, so this comparison is
  // over an empty range.
  nvbench::detail::ring_buffer<int> avg(3);
  ASSERT(avg.capacity() == 3);
  ASSERT(avg.size() == 0);
  ASSERT(avg.empty());
  ASSERT(equal(avg, {0, 0, 0}));

  // Fill the buffer one element at a time.
  avg.push_back(32);
  ASSERT(!avg.empty());
  ASSERT(avg.size() == 1);
  ASSERT(avg.capacity() == 3);
  ASSERT_MSG(avg.back() == 32, " (got {})", avg.back());
  ASSERT(equal(avg, {32, 0, 0}));

  avg.push_back(2);
  ASSERT(avg.size() == 2);
  ASSERT(avg.capacity() == 3);
  ASSERT_MSG(avg.back() == 2, " (got {})", avg.back());
  ASSERT(equal(avg, {32, 2, 0}));

  avg.push_back(-15);
  ASSERT(avg.size() == 3);
  ASSERT(avg.capacity() == 3);
  ASSERT_MSG(avg.back() == -15, " (got {})", avg.back());
  ASSERT(equal(avg, {32, 2, -15}));

  // The buffer is full: further pushes overwrite the oldest slot while
  // size() stays at capacity().
  avg.push_back(5);
  ASSERT(avg.size() == 3);
  ASSERT(avg.capacity() == 3);
  ASSERT_MSG(avg.back() == 5, " (got {})", avg.back());
  ASSERT(equal(avg, {5, 2, -15}));

  avg.push_back(0);
  ASSERT(avg.size() == 3);
  ASSERT(avg.capacity() == 3);
  ASSERT(equal(avg, {5, 0, -15}));
  ASSERT_MSG(avg.back() == 0, " (got {})", avg.back());

  avg.push_back(128);
  ASSERT(avg.size() == 3);
  ASSERT(avg.capacity() == 3);
  ASSERT(equal(avg, {5, 0, 128}));
  ASSERT_MSG(avg.back() == 128, " (got {})", avg.back());

  // clear() drops all values but keeps the capacity.
  avg.clear();
  ASSERT(avg.empty());
  ASSERT(avg.size() == 0);
  ASSERT(avg.capacity() == 3);

  return 0;
}
catch (const std::exception &err)
{ // Catch by reference-to-const: the handler only reads the message.
  fmt::print(stderr, "{}", err.what());
  return 1;
}