mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-04-29 11:11:15 +00:00
Merge pull request #64 from allisonvacanti/noise_convergence
New convergence check
This commit is contained in:
@@ -33,6 +33,10 @@ option(NVBench_ENABLE_NVML "Build with NVML support from the Cuda Toolkit." ON)
|
|||||||
option(NVBench_ENABLE_CUPTI "Build NVBench with CUPTI." ${cupti_default})
|
option(NVBench_ENABLE_CUPTI "Build NVBench with CUPTI." ${cupti_default})
|
||||||
|
|
||||||
option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
|
option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
|
||||||
|
# Opt-in switch for the tests that need a GPU with locked clocks.
option(
  NVBench_ENABLE_DEVICE_TESTING
  "Include tests that require a GPU (with locked clocks)."
  OFF
)
|
||||||
option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)
|
option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)
|
||||||
|
|
||||||
include(cmake/NVBenchConfigTarget.cmake)
|
include(cmake/NVBenchConfigTarget.cmake)
|
||||||
|
|||||||
@@ -19,12 +19,14 @@
|
|||||||
#include <nvbench/detail/measure_cold.cuh>
|
#include <nvbench/detail/measure_cold.cuh>
|
||||||
|
|
||||||
#include <nvbench/benchmark_base.cuh>
|
#include <nvbench/benchmark_base.cuh>
|
||||||
#include <nvbench/detail/throw.cuh>
|
|
||||||
#include <nvbench/device_info.cuh>
|
#include <nvbench/device_info.cuh>
|
||||||
#include <nvbench/printer_base.cuh>
|
#include <nvbench/printer_base.cuh>
|
||||||
#include <nvbench/state.cuh>
|
#include <nvbench/state.cuh>
|
||||||
#include <nvbench/summary.cuh>
|
#include <nvbench/summary.cuh>
|
||||||
|
|
||||||
|
#include <nvbench/detail/ring_buffer.cuh>
|
||||||
|
#include <nvbench/detail/throw.cuh>
|
||||||
|
|
||||||
#include <fmt/format.h>
|
#include <fmt/format.h>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@@ -43,7 +45,7 @@ measure_cold_base::measure_cold_base(state &exec_state)
|
|||||||
, m_min_time{exec_state.get_min_time()}
|
, m_min_time{exec_state.get_min_time()}
|
||||||
, m_skip_time{exec_state.get_skip_time()}
|
, m_skip_time{exec_state.get_skip_time()}
|
||||||
, m_timeout{exec_state.get_timeout()}
|
, m_timeout{exec_state.get_timeout()}
|
||||||
{ }
|
{}
|
||||||
|
|
||||||
void measure_cold_base::check()
|
void measure_cold_base::check()
|
||||||
{
|
{
|
||||||
@@ -62,6 +64,113 @@ void measure_cold_base::check()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void measure_cold_base::initialize()
|
||||||
|
{
|
||||||
|
m_total_cuda_time = 0.;
|
||||||
|
m_total_cpu_time = 0.;
|
||||||
|
m_cpu_noise = 0.;
|
||||||
|
m_total_samples = 0;
|
||||||
|
m_noise_tracker.clear();
|
||||||
|
m_cuda_times.clear();
|
||||||
|
m_cpu_times.clear();
|
||||||
|
m_max_time_exceeded = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Start the wall-clock timer that is_finished() compares against the timeout.
 */
void measure_cold_base::run_trials_prologue()
{
  m_walltime_timer.start();
}
|
||||||
|
|
||||||
|
void measure_cold_base::record_measurements()
|
||||||
|
{
|
||||||
|
// Update and record timers and counters:
|
||||||
|
const auto cur_cuda_time = m_cuda_timer.get_duration();
|
||||||
|
const auto cur_cpu_time = m_cpu_timer.get_duration();
|
||||||
|
m_cuda_times.push_back(cur_cuda_time);
|
||||||
|
m_cpu_times.push_back(cur_cpu_time);
|
||||||
|
m_total_cuda_time += cur_cuda_time;
|
||||||
|
m_total_cpu_time += cur_cpu_time;
|
||||||
|
++m_total_samples;
|
||||||
|
|
||||||
|
// Compute convergence statistics using CUDA timings:
|
||||||
|
const auto mean_cuda_time = m_total_cuda_time /
|
||||||
|
static_cast<nvbench::float64_t>(m_total_samples);
|
||||||
|
const auto cuda_stdev =
|
||||||
|
nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
|
||||||
|
m_cuda_times.cend(),
|
||||||
|
mean_cuda_time);
|
||||||
|
auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
|
||||||
|
if (std::isfinite(cuda_rel_stdev))
|
||||||
|
{
|
||||||
|
m_noise_tracker.push_back(cuda_rel_stdev);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool measure_cold_base::is_finished()
|
||||||
|
{
|
||||||
|
if (m_run_once)
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that we've gathered enough samples:
|
||||||
|
if (m_total_cuda_time > m_min_time && m_total_samples > m_min_samples)
|
||||||
|
{
|
||||||
|
// Noise has dropped below threshold
|
||||||
|
if (m_noise_tracker.back() < m_max_noise)
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if the noise (cuda rel stdev) has converged by inspecting a
|
||||||
|
// trailing window of recorded noise measurements.
|
||||||
|
// This helps identify benchmarks that are inherently noisy and would
|
||||||
|
// never converge to the target stdev threshold. This check ensures that the
|
||||||
|
// benchmark will end if the stdev stabilizes above the target threshold.
|
||||||
|
// Gather some iterations before checking noise, and limit how often we
|
||||||
|
// check this.
|
||||||
|
if (m_noise_tracker.size() > 64 && (m_total_samples % 16 == 0))
|
||||||
|
{
|
||||||
|
// Use the current noise as the stdev reference.
|
||||||
|
const auto current_noise = m_noise_tracker.back();
|
||||||
|
const auto noise_stdev = nvbench::detail::statistics::standard_deviation(
|
||||||
|
m_noise_tracker.cbegin(),
|
||||||
|
m_noise_tracker.cend(),
|
||||||
|
current_noise);
|
||||||
|
const auto noise_rel_stdev = noise_stdev / current_noise;
|
||||||
|
|
||||||
|
// If the rel stdev of the last N cuda noise measurements is less than
|
||||||
|
// 5%, consider the result stable.
|
||||||
|
const auto noise_threshold = 0.05;
|
||||||
|
if (noise_rel_stdev < noise_threshold)
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for timeouts:
|
||||||
|
m_walltime_timer.stop();
|
||||||
|
if (m_walltime_timer.get_duration() > m_timeout)
|
||||||
|
{
|
||||||
|
m_max_time_exceeded = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void measure_cold_base::run_trials_epilogue()
|
||||||
|
{
|
||||||
|
// Only need to compute this at the end, not per iteration.
|
||||||
|
const auto cpu_mean = m_total_cuda_time /
|
||||||
|
static_cast<nvbench::float64_t>(m_total_samples);
|
||||||
|
const auto cpu_stdev =
|
||||||
|
nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
|
||||||
|
m_cpu_times.cend(),
|
||||||
|
m_total_cpu_time);
|
||||||
|
m_cpu_noise = cpu_stdev / cpu_mean;
|
||||||
|
|
||||||
|
m_walltime_timer.stop();
|
||||||
|
}
|
||||||
|
|
||||||
void measure_cold_base::generate_summaries()
|
void measure_cold_base::generate_summaries()
|
||||||
{
|
{
|
||||||
const auto d_samples = static_cast<double>(m_total_samples);
|
const auto d_samples = static_cast<double>(m_total_samples);
|
||||||
@@ -113,7 +222,10 @@ void measure_cold_base::generate_summaries()
|
|||||||
summ.set_string("description",
|
summ.set_string("description",
|
||||||
"Relative standard deviation of the cold GPU execution "
|
"Relative standard deviation of the cold GPU execution "
|
||||||
"time measurements.");
|
"time measurements.");
|
||||||
summ.set_float64("value", m_cuda_noise);
|
summ.set_float64("value",
|
||||||
|
m_noise_tracker.empty()
|
||||||
|
? std::numeric_limits<nvbench::float64_t>::infinity()
|
||||||
|
: m_noise_tracker.back());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (const auto items = m_state.get_element_count(); items != 0)
|
if (const auto items = m_state.get_element_count(); items != 0)
|
||||||
@@ -161,16 +273,16 @@ void measure_cold_base::generate_summaries()
|
|||||||
|
|
||||||
if (m_max_time_exceeded)
|
if (m_max_time_exceeded)
|
||||||
{
|
{
|
||||||
const auto timeout = m_timeout_timer.get_duration();
|
const auto timeout = m_walltime_timer.get_duration();
|
||||||
|
|
||||||
if (m_cuda_noise > m_max_noise)
|
if (!m_noise_tracker.empty() && m_noise_tracker.back() > m_max_noise)
|
||||||
{
|
{
|
||||||
printer.log(nvbench::log_level::warn,
|
printer.log(nvbench::log_level::warn,
|
||||||
fmt::format("Current measurement timed out ({:0.2f}s) "
|
fmt::format("Current measurement timed out ({:0.2f}s) "
|
||||||
"while over noise threshold ({:0.2f}% > "
|
"while over noise threshold ({:0.2f}% > "
|
||||||
"{:0.2f}%)",
|
"{:0.2f}%)",
|
||||||
timeout,
|
timeout,
|
||||||
m_cuda_noise * 100,
|
m_noise_tracker.back() * 100,
|
||||||
m_max_noise * 100));
|
m_max_noise * 100));
|
||||||
}
|
}
|
||||||
if (m_total_samples < m_min_samples)
|
if (m_total_samples < m_min_samples)
|
||||||
|
|||||||
@@ -28,6 +28,7 @@
|
|||||||
|
|
||||||
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
|
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
|
||||||
#include <nvbench/detail/l2flush.cuh>
|
#include <nvbench/detail/l2flush.cuh>
|
||||||
|
#include <nvbench/detail/ring_buffer.cuh>
|
||||||
#include <nvbench/detail/statistics.cuh>
|
#include <nvbench/detail/statistics.cuh>
|
||||||
|
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
@@ -58,19 +59,11 @@ protected:
|
|||||||
struct kernel_launch_timer;
|
struct kernel_launch_timer;
|
||||||
|
|
||||||
void check();
|
void check();
|
||||||
|
void initialize();
|
||||||
void initialize()
|
void run_trials_prologue();
|
||||||
{
|
void record_measurements();
|
||||||
m_total_cuda_time = 0.;
|
bool is_finished();
|
||||||
m_total_cpu_time = 0.;
|
void run_trials_epilogue();
|
||||||
m_cuda_noise = 0.;
|
|
||||||
m_cpu_noise = 0.;
|
|
||||||
m_total_samples = 0;
|
|
||||||
m_cuda_times.clear();
|
|
||||||
m_cpu_times.clear();
|
|
||||||
m_max_time_exceeded = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
void generate_summaries();
|
void generate_summaries();
|
||||||
|
|
||||||
void check_skip_time(nvbench::float64_t warmup_time);
|
void check_skip_time(nvbench::float64_t warmup_time);
|
||||||
@@ -86,7 +79,6 @@ protected:
|
|||||||
}
|
}
|
||||||
|
|
||||||
void block_stream();
|
void block_stream();
|
||||||
|
|
||||||
__forceinline__ void unblock_stream() { m_blocker.unblock(); }
|
__forceinline__ void unblock_stream() { m_blocker.unblock(); }
|
||||||
|
|
||||||
nvbench::state &m_state;
|
nvbench::state &m_state;
|
||||||
@@ -94,7 +86,7 @@ protected:
|
|||||||
nvbench::launch m_launch;
|
nvbench::launch m_launch;
|
||||||
nvbench::cuda_timer m_cuda_timer;
|
nvbench::cuda_timer m_cuda_timer;
|
||||||
nvbench::cpu_timer m_cpu_timer;
|
nvbench::cpu_timer m_cpu_timer;
|
||||||
nvbench::cpu_timer m_timeout_timer;
|
nvbench::cpu_timer m_walltime_timer;
|
||||||
nvbench::detail::l2flush m_l2flush;
|
nvbench::detail::l2flush m_l2flush;
|
||||||
nvbench::blocking_kernel m_blocker;
|
nvbench::blocking_kernel m_blocker;
|
||||||
|
|
||||||
@@ -110,8 +102,10 @@ protected:
|
|||||||
nvbench::int64_t m_total_samples{};
|
nvbench::int64_t m_total_samples{};
|
||||||
nvbench::float64_t m_total_cuda_time{};
|
nvbench::float64_t m_total_cuda_time{};
|
||||||
nvbench::float64_t m_total_cpu_time{};
|
nvbench::float64_t m_total_cpu_time{};
|
||||||
nvbench::float64_t m_cuda_noise{}; // rel stdev
|
nvbench::float64_t m_cpu_noise{}; // rel stdev
|
||||||
nvbench::float64_t m_cpu_noise{}; // rel stdev
|
|
||||||
|
// Trailing history of noise measurements for convergence tests
|
||||||
|
nvbench::detail::ring_buffer<nvbench::float64_t> m_noise_tracker{512};
|
||||||
|
|
||||||
std::vector<nvbench::float64_t> m_cuda_times;
|
std::vector<nvbench::float64_t> m_cuda_times;
|
||||||
std::vector<nvbench::float64_t> m_cpu_times;
|
std::vector<nvbench::float64_t> m_cpu_times;
|
||||||
@@ -170,7 +164,11 @@ struct measure_cold : public measure_cold_base
|
|||||||
this->check();
|
this->check();
|
||||||
this->initialize();
|
this->initialize();
|
||||||
this->run_warmup();
|
this->run_warmup();
|
||||||
|
|
||||||
|
this->run_trials_prologue();
|
||||||
this->run_trials();
|
this->run_trials();
|
||||||
|
this->run_trials_epilogue();
|
||||||
|
|
||||||
this->generate_summaries();
|
this->generate_summaries();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -192,47 +190,12 @@ private:
|
|||||||
|
|
||||||
void run_trials()
|
void run_trials()
|
||||||
{
|
{
|
||||||
m_timeout_timer.start();
|
|
||||||
kernel_launch_timer<use_blocking_kernel> timer(*this);
|
kernel_launch_timer<use_blocking_kernel> timer(*this);
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
this->launch_kernel(timer);
|
this->launch_kernel(timer);
|
||||||
|
this->record_measurements();
|
||||||
const auto cur_cuda_time = m_cuda_timer.get_duration();
|
} while (!this->is_finished());
|
||||||
const auto cur_cpu_time = m_cpu_timer.get_duration();
|
|
||||||
m_cuda_times.push_back(cur_cuda_time);
|
|
||||||
m_cpu_times.push_back(cur_cpu_time);
|
|
||||||
m_total_cuda_time += cur_cuda_time;
|
|
||||||
m_total_cpu_time += cur_cpu_time;
|
|
||||||
++m_total_samples;
|
|
||||||
|
|
||||||
// Only consider the cuda noise in the convergence criteria.
|
|
||||||
m_cuda_noise = nvbench::detail::compute_noise(m_cuda_times,
|
|
||||||
m_total_cuda_time);
|
|
||||||
|
|
||||||
m_timeout_timer.stop();
|
|
||||||
const auto total_time = m_timeout_timer.get_duration();
|
|
||||||
|
|
||||||
if (m_run_once)
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (m_total_cuda_time > m_min_time && // Min time okay
|
|
||||||
m_total_samples > m_min_samples && // Min samples okay
|
|
||||||
m_cuda_noise < m_max_noise) // Noise okay
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (total_time > m_timeout) // Max time exceeded, stop iterating.
|
|
||||||
{
|
|
||||||
m_max_time_exceeded = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} while (true);
|
|
||||||
m_cpu_noise = nvbench::detail::compute_noise(m_cpu_times, m_total_cpu_time);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename TimerT>
|
template <typename TimerT>
|
||||||
|
|||||||
129
nvbench/detail/ring_buffer.cuh
Normal file
129
nvbench/detail/ring_buffer.cuh
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2021 NVIDIA Corporation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License.
|
||||||
|
*
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <nvbench/config.cuh>
|
||||||
|
|
||||||
|
#include <nvbench/detail/statistics.cuh>
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace nvbench::detail
|
||||||
|
{
|
||||||
|
|
||||||
|
/**
 * @brief A simple ring buffer whose capacity is chosen at construction time.
 *
 * Values are kept in a std::vector sized to the capacity. Once the buffer is
 * full, each new value overwrites the oldest one.
 */
template <typename T>
struct ring_buffer
{
  /**
   * Create a new ring buffer with the requested capacity.
   *
   * The backing storage is allocated and value-initialized up front.
   */
  explicit ring_buffer(std::size_t capacity)
      : m_buffer(capacity)
  {}

  /**
   * Iterators provide all values in the ring buffer in unspecified order.
   * The range covers the first size() slots of the backing storage.
   * @{
   */
  // clang-format off
  [[nodiscard]] auto begin()        { return m_buffer.begin(); }
  [[nodiscard]] auto begin()  const { return m_buffer.begin(); }
  [[nodiscard]] auto cbegin() const { return m_buffer.cbegin(); }
  [[nodiscard]] auto end()          { return this->begin() + this->size(); }
  [[nodiscard]] auto end()    const { return this->begin() + this->size(); }
  [[nodiscard]] auto cend()   const { return this->cbegin() + this->size(); }
  // clang-format on
  /** @} */

  /**
   * The number of valid values in the ring buffer. Always <= capacity().
   */
  [[nodiscard]] std::size_t size() const
  {
    if (m_full)
    {
      return this->capacity();
    }
    // Before the first wrap, the write position doubles as the element count.
    return m_index;
  }

  /**
   * The maximum size of the ring buffer.
   */
  [[nodiscard]] std::size_t capacity() const { return m_buffer.size(); }

  /**
   * @return True if the ring buffer is empty.
   */
  [[nodiscard]] bool empty() const { return !m_full && m_index == 0; }

  /**
   * Remove all values from the buffer without modifying capacity.
   *
   * Only the bookkeeping is reset; the backing storage is left untouched.
   */
  void clear()
  {
    m_full  = false;
    m_index = 0;
  }

  /**
   * Add a new value to the ring buffer. If size() == capacity(), the oldest
   * element in the buffer is overwritten.
   */
  void push_back(T val)
  {
    assert(m_index < m_buffer.size());

    m_buffer[m_index] = val;

    ++m_index;
    if (m_index == m_buffer.size())
    { // buffer wrapped: continue at the front, every slot now holds a value
      m_index = 0;
      m_full  = true;
    }
  }

  /**
   * Get the most recently added value (returned by value).
   * The buffer must not be empty.
   * @{
   */
  [[nodiscard]] auto back() const
  {
    assert(!this->empty());
    return m_buffer[this->newest_index()];
  }
  [[nodiscard]] auto back()
  {
    assert(!this->empty());
    return m_buffer[this->newest_index()];
  }
  /**@}*/

private:
  /**
   * Index of the slot written by the latest push_back(). The write position
   * points one past it, wrapping to the last slot when it sits at zero.
   */
  [[nodiscard]] std::size_t newest_index() const
  {
    return (m_index == 0 ? m_buffer.size() : m_index) - 1;
  }

  std::vector<T> m_buffer;
  std::size_t m_index{0};
  bool m_full{false};
};
|
||||||
|
|
||||||
|
} // namespace nvbench::detail
|
||||||
@@ -18,48 +18,47 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <nvbench/types.cuh>
|
||||||
|
|
||||||
#include <nvbench/detail/transform_reduce.cuh>
|
#include <nvbench/detail/transform_reduce.cuh>
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
#include <iterator>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <numeric>
|
#include <type_traits>
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
namespace nvbench::detail
|
namespace nvbench::detail::statistics
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a vector of samples and the precomputed sum of all samples in the
|
* Computes and returns the unbiased sample standard deviation.
|
||||||
* vector, return a measure of the noise in the samples.
|
|
||||||
*
|
*
|
||||||
* The noise metric is the relative unbiased sample standard deviation
|
* If the input has fewer than 5 samples, infinity is returned.
|
||||||
* (std_dev / mean).
|
|
||||||
*/
|
*/
|
||||||
inline nvbench::float64_t
|
template <typename Iter,
|
||||||
compute_noise(const std::vector<nvbench::float64_t> &data,
|
typename ValueType = typename std::iterator_traits<Iter>::value_type>
|
||||||
nvbench::float64_t sum)
|
ValueType standard_deviation(Iter first, Iter last, ValueType mean)
|
||||||
{
|
{
|
||||||
const auto num = static_cast<nvbench::float64_t>(data.size());
|
static_assert(std::is_floating_point_v<ValueType>);
|
||||||
|
|
||||||
|
const auto num = last - first;
|
||||||
if (num < 5) // don't bother with low sample sizes.
|
if (num < 5) // don't bother with low sample sizes.
|
||||||
{
|
{
|
||||||
return std::numeric_limits<nvbench::float64_t>::infinity();
|
return std::numeric_limits<ValueType>::infinity();
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto mean = sum / num;
|
const auto variance = nvbench::detail::transform_reduce(first,
|
||||||
const auto variance =
|
last,
|
||||||
nvbench::detail::transform_reduce(data.cbegin(),
|
ValueType{},
|
||||||
data.cend(),
|
std::plus<>{},
|
||||||
0.,
|
[mean](auto val) {
|
||||||
std::plus<>{},
|
val -= mean;
|
||||||
[mean](nvbench::float64_t val) {
|
val *= val;
|
||||||
val -= mean;
|
return val;
|
||||||
val *= val;
|
}) /
|
||||||
return val;
|
static_cast<ValueType>((num - 1));
|
||||||
}) /
|
return std::sqrt(variance);
|
||||||
(num - 1);
|
|
||||||
const auto abs_stdev = std::sqrt(variance);
|
|
||||||
return abs_stdev / mean;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace nvbench::detail
|
} // namespace nvbench::detail::statistics
|
||||||
|
|||||||
@@ -18,6 +18,8 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
// Many compilers still don't ship transform_reduce with their STLs, so here's
|
// Many compilers still don't ship transform_reduce with their STLs, so here's
|
||||||
// a naive implementation that will work everywhere. This is never used in a
|
// a naive implementation that will work everywhere. This is never used in a
|
||||||
// critical section, so perf isn't a concern.
|
// critical section, so perf isn't a concern.
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ const summary &state::get_summary(std::string_view name) const
|
|||||||
[&name](const auto &s) { return s.get_name() == name; });
|
[&name](const auto &s) { return s.get_name() == name; });
|
||||||
if (iter == m_summaries.cend())
|
if (iter == m_summaries.cend())
|
||||||
{
|
{
|
||||||
NVBENCH_THROW(std::runtime_error, "No summary named '{}'.", name);
|
NVBENCH_THROW(std::invalid_argument, "No summary named '{}'.", name);
|
||||||
}
|
}
|
||||||
return *iter;
|
return *iter;
|
||||||
}
|
}
|
||||||
@@ -140,7 +140,7 @@ summary &state::get_summary(std::string_view name)
|
|||||||
[&name](auto &s) { return s.get_name() == name; });
|
[&name](auto &s) { return s.get_name() == name; });
|
||||||
if (iter == m_summaries.end())
|
if (iter == m_summaries.end())
|
||||||
{
|
{
|
||||||
NVBENCH_THROW(std::runtime_error, "No summary named '{}'.", name);
|
NVBENCH_THROW(std::invalid_argument, "No summary named '{}'.", name);
|
||||||
}
|
}
|
||||||
return *iter;
|
return *iter;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -262,10 +262,13 @@ def main():
|
|||||||
to_compare = []
|
to_compare = []
|
||||||
if os.path.isdir(files_or_dirs[0]) and os.path.isdir(files_or_dirs[1]):
|
if os.path.isdir(files_or_dirs[0]) and os.path.isdir(files_or_dirs[1]):
|
||||||
for f in os.listdir(files_or_dirs[1]):
|
for f in os.listdir(files_or_dirs[1]):
|
||||||
|
if os.path.splitext(f)[1] != ".json":
|
||||||
|
continue
|
||||||
r = os.path.join(files_or_dirs[0], f)
|
r = os.path.join(files_or_dirs[0], f)
|
||||||
c = os.path.join(files_or_dirs[1], f)
|
c = os.path.join(files_or_dirs[1], f)
|
||||||
if os.path.isfile(r) and os.path.isfile(c):
|
if os.path.isfile(r) and os.path.isfile(c) and \
|
||||||
to_compare.append( (r,c) )
|
os.path.getsize(r) > 0 and os.path.getsize(c) > 0:
|
||||||
|
to_compare.append((r, c))
|
||||||
else:
|
else:
|
||||||
to_compare = [(files_or_dirs[0],files_or_dirs[1])]
|
to_compare = [(files_or_dirs[0],files_or_dirs[1])]
|
||||||
|
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ set(test_srcs
|
|||||||
named_values.cu
|
named_values.cu
|
||||||
option_parser.cu
|
option_parser.cu
|
||||||
range.cu
|
range.cu
|
||||||
|
ring_buffer.cu
|
||||||
runner.cu
|
runner.cu
|
||||||
state.cu
|
state.cu
|
||||||
state_generator.cu
|
state_generator.cu
|
||||||
@@ -36,3 +37,4 @@ foreach(test_src IN LISTS test_srcs)
|
|||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
add_subdirectory(cmake)
|
add_subdirectory(cmake)
|
||||||
|
add_subdirectory(device)
|
||||||
|
|||||||
14
testing/device/CMakeLists.txt
Normal file
14
testing/device/CMakeLists.txt
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# Test that we're converging to an accurate mean + stdev without timing out.
# The executable is always built as part of nvbench.test.all, but it is only
# registered with ctest when device testing is enabled.
set(test_name nvbench.test.device.noisy_bench)

add_executable(${test_name} noisy_bench.cu)
target_link_libraries(${test_name} PRIVATE nvbench::main fmt)
nvbench_config_target(${test_name})
add_dependencies(nvbench.test.all ${test_name})

if (NVBench_ENABLE_DEVICE_TESTING)
  add_test(NAME ${test_name} COMMAND "$<TARGET_FILE:${test_name}>")

  # Any timeouts/warnings are hard failures for this test: ctest fails it as
  # soon as the output matches one of these expressions.
  set_tests_properties(${test_name} PROPERTIES
    FAIL_REGULAR_EXPRESSION "Warn;timed out"
  )
endif()
|
||||||
146
testing/device/noisy_bench.cu
Normal file
146
testing/device/noisy_bench.cu
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2021 NVIDIA Corporation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License.
|
||||||
|
*
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <nvbench/nvbench.cuh>
|
||||||
|
#include <nvbench/test_kernels.cuh>
|
||||||
|
|
||||||
|
#include <fmt/format.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <limits>
|
||||||
|
#include <random>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
/**
 * Benchmark with a synthetic, normally distributed runtime.
 *
 * Each launch sleeps on the device for a duration drawn from a normal
 * distribution built from the "Mean" (ms) and "Noise" (relative stdev, %)
 * axes. After the measurement, the mean and stdev reported in the state's
 * summaries are compared with the requested values and a results table is
 * printed. Any value outside its threshold additionally prints a line that
 * starts with "Warn:".
 */
void noisy_bench(nvbench::state &state)
{
  // Requested mean: axis value is in ms, convert to seconds.
  const auto expected_mean =
    static_cast<nvbench::float32_t>(state.get_float64("Mean")) / 1000.f;
  // Requested relative stdev: axis value is in percent, convert to a ratio.
  const auto expected_noise_pct =
    static_cast<nvbench::float32_t>(state.get_float64("Noise"));
  const auto expected_noise = expected_noise_pct / 100.f;
  // Requested absolute stdev, in seconds.
  const auto expected_stdev = expected_noise * expected_mean;

  std::minstd_rand engine{};
  std::normal_distribution<nvbench::float32_t> sleep_times(expected_mean,
                                                           expected_stdev);

  // cold tag will save time by disabling batch measurements
  state.exec(nvbench::exec_tag::impl::cold, [&](nvbench::launch &launch) {
    const auto seconds = sleep_times(engine);
    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(seconds);
  });

  const auto measured_mean = static_cast<nvbench::float32_t>(
    state.get_summary("Average GPU Time (Cold)").get_float64("value"));

  // A missing noise summary surfaces as std::invalid_argument; report it as
  // infinite noise instead of aborting the benchmark.
  const auto read_measured_noise = [&]() {
    try
    {
      return static_cast<nvbench::float32_t>(
        state.get_summary("GPU Relative Standard Deviation (Cold)")
          .get_float64("value"));
    }
    catch (const std::invalid_argument &)
    {
      return std::numeric_limits<nvbench::float32_t>::infinity();
    }
  };
  const auto measured_noise = read_measured_noise();
  const auto measured_stdev = measured_noise * measured_mean;

  // Absolute deviations from the requested values.
  const auto mean_error  = std::fabs(measured_mean - expected_mean);
  const auto stdev_error = std::fabs(measured_stdev - expected_stdev);
  const auto noise_error = std::fabs(measured_noise - expected_noise);

  // Accepted deviation: a relative bound with an absolute floor.
  const auto mean_threshold  = std::max(0.025f * expected_mean, 8e-6f); // 2.5% or 8us
  const auto stdev_threshold = std::max(0.05f * expected_stdev, 5e-6f); // 5% or 5us

  const bool mean_pass  = mean_error < mean_threshold;
  const bool stdev_pass = stdev_error < stdev_threshold;

  // Table header.
  fmt::print("| {:^5} "
             "| {:^12} | {:^12} "
             "| {:^12} | {:^12} | {:^4} |\n",
             "",
             "Expected",
             "Measured",
             "Error",
             "Threshold",
             "Flag");
  // Header separator.
  fmt::print("|{:-^7}"
             "|{:-^14}|{:-^14}"
             "|{:-^14}|{:-^14}|{:-^6}|\n",
             "",
             "",
             "",
             "",
             "",
             "");
  // One row each for mean, stdev and noise. Times are printed in ms, noise
  // in percent. The noise row has no threshold and no flag.
  fmt::print("| Mean "
             "| {:>9.6f} ms | {:>9.6f} ms "
             "| {:>9.6f} ms | {:>9.6f} ms | {:4} |\n"
             "| Stdev "
             "| {:>9.6f} ms | {:>9.6f} ms "
             "| {:>9.6f} ms | {:>9.6f} ms | {:4} |\n"
             "| Noise "
             "| {:>9.6f}% | {:>9.6f}% "
             "| {:>9.6f}% | {:5} | {:4} |\n",
             expected_mean * 1000,
             measured_mean * 1000,
             mean_error * 1000,
             mean_threshold * 1000,
             mean_pass ? "" : "!!!!",

             expected_stdev * 1000,
             measured_stdev * 1000,
             stdev_error * 1000,
             stdev_threshold * 1000,
             stdev_pass ? "" : "!!!!",

             expected_noise * 100,
             measured_noise * 100,
             noise_error * 100,
             "",
             "");

  if (!mean_pass)
  {
    // This does not go through the nvbench logger. The "Warn" prefix is what
    // the test's FAIL_REGULAR_EXPRESSION property matches to fail the test.
    fmt::print("Warn: Mean error exceeds threshold: ({:.3} ms > {:.3} ms)\n",
               mean_error * 1000,
               mean_threshold * 1000);
  }

  if (!stdev_pass)
  {
    // This does not go through the nvbench logger. The "Warn" prefix is what
    // the test's FAIL_REGULAR_EXPRESSION property matches to fail the test.
    fmt::print("Warn: Stdev error exceeds threshold: "
               "({:.6} ms > {:.6} ms, noise: {:.3}%)\n",
               stdev_error * 1000,
               stdev_threshold * 1000,
               measured_noise * 100);
  }
}
|
||||||
|
// Register noisy_bench over a grid of requested means and noise levels.
NVBENCH_BENCH(noisy_bench)
  .add_float64_axis("Mean", {0.05, 0.1, 0.5, 1.0, 10.0}) // ms
  .add_float64_axis("Noise", {0.1, 5., 25.})             // %
  // Request a max_noise so small that it is effectively never reached. This
  // tests that the benchmarking loop still exits when the noise target
  // cannot be met:
  .set_max_noise(0.0000001);
|
||||||
90
testing/ring_buffer.cu
Normal file
90
testing/ring_buffer.cu
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2021 NVIDIA Corporation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License.
|
||||||
|
*
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <nvbench/detail/ring_buffer.cuh>
|
||||||
|
|
||||||
|
#include "test_asserts.cuh"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool equal(const nvbench::detail::ring_buffer<T> &buffer,
|
||||||
|
const std::vector<T> &reference)
|
||||||
|
{
|
||||||
|
return std::equal(buffer.cbegin(), buffer.cend(), reference.cbegin());
|
||||||
|
}
|
||||||
|
|
||||||
|
int main()
|
||||||
|
try
|
||||||
|
{
|
||||||
|
nvbench::detail::ring_buffer<int> avg(3);
|
||||||
|
ASSERT(avg.capacity() == 3);
|
||||||
|
ASSERT(avg.size() == 0);
|
||||||
|
ASSERT(avg.empty());
|
||||||
|
ASSERT(equal(avg, {0, 0, 0}));
|
||||||
|
|
||||||
|
avg.push_back(32);
|
||||||
|
ASSERT(!avg.empty());
|
||||||
|
ASSERT(avg.size() == 1);
|
||||||
|
ASSERT(avg.capacity() == 3);
|
||||||
|
ASSERT_MSG(avg.back() == 32, " (got {})", avg.back());
|
||||||
|
ASSERT(equal(avg, {32, 0, 0}));
|
||||||
|
|
||||||
|
avg.push_back(2);
|
||||||
|
ASSERT(avg.size() == 2);
|
||||||
|
ASSERT(avg.capacity() == 3);
|
||||||
|
ASSERT_MSG(avg.back() == 2, " (got {})", avg.back());
|
||||||
|
ASSERT(equal(avg, {32, 2, 0}));
|
||||||
|
|
||||||
|
avg.push_back(-15);
|
||||||
|
ASSERT(avg.size() == 3);
|
||||||
|
ASSERT(avg.capacity() == 3);
|
||||||
|
ASSERT_MSG(avg.back() == -15, " (got {})", avg.back());
|
||||||
|
ASSERT(equal(avg, {32, 2, -15}));
|
||||||
|
|
||||||
|
avg.push_back(5);
|
||||||
|
ASSERT(avg.size() == 3);
|
||||||
|
ASSERT(avg.capacity() == 3);
|
||||||
|
ASSERT_MSG(avg.back() == 5, " (got {})", avg.back());
|
||||||
|
ASSERT(equal(avg, {5, 2, -15}));
|
||||||
|
|
||||||
|
avg.push_back(0);
|
||||||
|
ASSERT(avg.size() == 3);
|
||||||
|
ASSERT(avg.capacity() == 3);
|
||||||
|
ASSERT(equal(avg, {5, 0, -15}));
|
||||||
|
ASSERT_MSG(avg.back() == 0, " (got {})", avg.back());
|
||||||
|
|
||||||
|
avg.push_back(128);
|
||||||
|
ASSERT(avg.size() == 3);
|
||||||
|
ASSERT(avg.capacity() == 3);
|
||||||
|
ASSERT(equal(avg, {5, 0, 128}));
|
||||||
|
ASSERT_MSG(avg.back() == 128, " (got {})", avg.back());
|
||||||
|
|
||||||
|
avg.clear();
|
||||||
|
ASSERT(avg.empty());
|
||||||
|
ASSERT(avg.size() == 0);
|
||||||
|
ASSERT(avg.capacity() == 3);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
catch (std::exception &err)
|
||||||
|
{
|
||||||
|
fmt::print(stderr, "{}", err.what());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user