diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index 48af61b..c6bb939 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -19,12 +19,14 @@ #include #include -#include #include #include #include #include +#include +#include + #include #include @@ -43,7 +45,7 @@ measure_cold_base::measure_cold_base(state &exec_state) , m_min_time{exec_state.get_min_time()} , m_skip_time{exec_state.get_skip_time()} , m_timeout{exec_state.get_timeout()} -{ } +{} void measure_cold_base::check() { @@ -62,6 +64,113 @@ void measure_cold_base::check() } } +void measure_cold_base::initialize() +{ + m_total_cuda_time = 0.; + m_total_cpu_time = 0.; + m_cpu_noise = 0.; + m_total_samples = 0; + m_noise_tracker.clear(); + m_cuda_times.clear(); + m_cpu_times.clear(); + m_max_time_exceeded = false; +} + +void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); } + +void measure_cold_base::record_measurements() +{ + // Update and record timers and counters: + const auto cur_cuda_time = m_cuda_timer.get_duration(); + const auto cur_cpu_time = m_cpu_timer.get_duration(); + m_cuda_times.push_back(cur_cuda_time); + m_cpu_times.push_back(cur_cpu_time); + m_total_cuda_time += cur_cuda_time; + m_total_cpu_time += cur_cpu_time; + ++m_total_samples; + + // Compute convergence statistics using CUDA timings: + const auto mean_cuda_time = m_total_cuda_time / + static_cast(m_total_samples); + const auto cuda_stdev = + nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(), + m_cuda_times.cend(), + mean_cuda_time); + auto cuda_rel_stdev = cuda_stdev / mean_cuda_time; + if (std::isfinite(cuda_rel_stdev)) + { + m_noise_tracker.push_back(cuda_rel_stdev); + } +} + +bool measure_cold_base::is_finished() +{ + if (m_run_once) + { + return true; + } + + // Check that we've gathered enough samples: + if (m_total_cuda_time > m_min_time && m_total_samples > m_min_samples) + { + // Noise has dropped below threshold + if (m_noise_tracker.back() < m_max_noise) + { + return true; + } + + // Check if the noise (cuda rel stdev) has converged by inspecting a + // trailing window of recorded noise measurements. + // This helps identify benchmarks that are inherently noisy and would + // never converge to the target stdev threshold. This check ensures that the + // benchmark will end if the stdev stabilizes above the target threshold. + // Gather some iterations before checking noise, and limit how often we + // check this. + if (m_noise_tracker.size() > 64 && (m_total_samples % 16 == 0)) + { + // Use the current noise as the stdev reference. + const auto current_noise = m_noise_tracker.back(); + const auto noise_stdev = nvbench::detail::statistics::standard_deviation( + m_noise_tracker.cbegin(), + m_noise_tracker.cend(), + current_noise); + const auto noise_rel_stdev = noise_stdev / current_noise; + + // If the rel stdev of the last N cuda noise measurements is less than + // 5%, consider the result stable. + const auto noise_threshold = 0.05; + if (noise_rel_stdev < noise_threshold) + { + return true; + } + } + } + + // Check for timeouts: + m_walltime_timer.stop(); + if (m_walltime_timer.get_duration() > m_timeout) + { + m_max_time_exceeded = true; + return true; + } + + return false; +} + +void measure_cold_base::run_trials_epilogue() +{ + // Only need to compute this at the end, not per iteration. + const auto cpu_mean = m_total_cuda_time / + static_cast(m_total_samples); + const auto cpu_stdev = + nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(), + m_cpu_times.cend(), + m_total_cpu_time); + m_cpu_noise = cpu_stdev / cpu_mean; + + m_walltime_timer.stop(); +} + void measure_cold_base::generate_summaries() { const auto d_samples = static_cast(m_total_samples); @@ -113,7 +222,10 @@ void measure_cold_base::generate_summaries() summ.set_string("description", "Relative standard deviation of the cold GPU execution " "time measurements."); - summ.set_float64("value", m_cuda_noise); + summ.set_float64("value", + m_noise_tracker.empty() + ? std::numeric_limits::infinity() + : m_noise_tracker.back()); } if (const auto items = m_state.get_element_count(); items != 0) @@ -161,16 +273,16 @@ void measure_cold_base::generate_summaries() if (m_max_time_exceeded) { - const auto timeout = m_timeout_timer.get_duration(); + const auto timeout = m_walltime_timer.get_duration(); - if (m_cuda_noise > m_max_noise) + if (!m_noise_tracker.empty() && m_noise_tracker.back() > m_max_noise) { printer.log(nvbench::log_level::warn, fmt::format("Current measurement timed out ({:0.2f}s) " "while over noise threshold ({:0.2f}% > " "{:0.2f}%)", timeout, - m_cuda_noise * 100, + m_noise_tracker.back() * 100, m_max_noise * 100)); } if (m_total_samples < m_min_samples) diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index 556d043..d4ba237 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -58,19 +59,11 @@ protected: struct kernel_launch_timer; void check(); - - void initialize() - { - m_total_cuda_time = 0.; - m_total_cpu_time = 0.; - m_cuda_noise = 0.; - m_cpu_noise = 0.; - m_total_samples = 0; - m_cuda_times.clear(); - m_cpu_times.clear(); - m_max_time_exceeded = false; - } - + void initialize(); + void run_trials_prologue(); + void record_measurements(); + bool is_finished(); + void run_trials_epilogue(); void generate_summaries(); void check_skip_time(nvbench::float64_t warmup_time); @@ -86,7 +79,6 @@ protected: } void block_stream(); - __forceinline__ void unblock_stream() { m_blocker.unblock(); } nvbench::state &m_state; @@ -94,7 +86,7 @@ protected: nvbench::launch m_launch; nvbench::cuda_timer m_cuda_timer; nvbench::cpu_timer m_cpu_timer; - nvbench::cpu_timer m_timeout_timer; + nvbench::cpu_timer m_walltime_timer; nvbench::detail::l2flush m_l2flush; nvbench::blocking_kernel m_blocker; @@ -110,8 +102,10 @@ protected: nvbench::int64_t m_total_samples{}; nvbench::float64_t m_total_cuda_time{}; nvbench::float64_t m_total_cpu_time{}; - nvbench::float64_t m_cuda_noise{}; // rel stdev - nvbench::float64_t m_cpu_noise{}; // rel stdev + nvbench::float64_t m_cpu_noise{}; // rel stdev + + // Trailing history of noise measurements for convergence tests + nvbench::detail::ring_buffer m_noise_tracker{512}; std::vector m_cuda_times; std::vector m_cpu_times; @@ -170,7 +164,11 @@ struct measure_cold : public measure_cold_base this->check(); this->initialize(); this->run_warmup(); + + this->run_trials_prologue(); this->run_trials(); + this->run_trials_epilogue(); + this->generate_summaries(); } @@ -192,47 +190,12 @@ private: void run_trials() { - m_timeout_timer.start(); kernel_launch_timer timer(*this); - do { this->launch_kernel(timer); - - const auto cur_cuda_time = m_cuda_timer.get_duration(); - const auto cur_cpu_time = m_cpu_timer.get_duration(); - m_cuda_times.push_back(cur_cuda_time); - m_cpu_times.push_back(cur_cpu_time); - m_total_cuda_time += cur_cuda_time; - m_total_cpu_time += cur_cpu_time; - ++m_total_samples; - - // Only consider the cuda noise in the convergence criteria. - m_cuda_noise = nvbench::detail::compute_noise(m_cuda_times, - m_total_cuda_time); - - m_timeout_timer.stop(); - const auto total_time = m_timeout_timer.get_duration(); - - if (m_run_once) - { - break; - } - - if (m_total_cuda_time > m_min_time && // Min time okay - m_total_samples > m_min_samples && // Min samples okay - m_cuda_noise < m_max_noise) // Noise okay - { - break; - } - - if (total_time > m_timeout) // Max time exceeded, stop iterating. - { - m_max_time_exceeded = true; - break; - } - } while (true); - m_cpu_noise = nvbench::detail::compute_noise(m_cpu_times, m_total_cpu_time); + this->record_measurements(); + } while (!this->is_finished()); } template diff --git a/nvbench/detail/ring_buffer.cuh b/nvbench/detail/ring_buffer.cuh new file mode 100644 index 0000000..fa86200 --- /dev/null +++ b/nvbench/detail/ring_buffer.cuh @@ -0,0 +1,129 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include +#include + +namespace nvbench::detail +{ + +/** + * @brief A simple, dynamically sized ring buffer. + */ +template +struct ring_buffer +{ + /** + * Create a new ring buffer with the requested capacity. + */ + explicit ring_buffer(std::size_t capacity) + : m_buffer(capacity) + {} + + /** + * Iterators provide all values in the ring buffer in unspecified order. + * @{ + */ + // clang-format off + [[nodiscard]] auto begin() { return m_buffer.begin(); } + [[nodiscard]] auto begin() const { return m_buffer.begin(); } + [[nodiscard]] auto cbegin() const { return m_buffer.cbegin(); } + [[nodiscard]] auto end() { return m_buffer.begin() + this->size(); } + [[nodiscard]] auto end() const { return m_buffer.begin() + this->size(); } + [[nodiscard]] auto cend() const { return m_buffer.cbegin() + this->size(); } + // clang-format on + /** @} */ + + /** + * The number of valid values in the ring buffer. Always <= capacity(). + */ + [[nodiscard]] std::size_t size() const + { + return m_full ? m_buffer.size() : m_index; + } + + /** + * The maximum size of the ring buffer. + */ + [[nodiscard]] std::size_t capacity() const + { + return m_buffer.size(); + } + + /** + * @return True if the ring buffer is empty. + */ + [[nodiscard]] bool empty() const { return m_index == 0 && !m_full; } + + /** + * Remove all values from the buffer without modifying capacity. + */ + void clear() + { + m_index = 0; + m_full = false; + } + + /** + * Add a new value to the ring buffer. If size() == capacity(), the oldest + * element in the buffer is overwritten. + */ + void push_back(T val) + { + assert(m_index < m_buffer.size()); + + m_buffer[m_index] = val; + + m_index = (m_index + 1) % m_buffer.size(); + if (m_index == 0) + { // buffer wrapped + m_full = true; + } + } + + /** + * Get the most recently added value. + * @{ + */ + [[nodiscard]] auto back() const + { + assert(!this->empty()); + const auto back_index = m_index == 0 ? m_buffer.size() - 1 : m_index - 1; + return m_buffer[back_index]; + } + [[nodiscard]] auto back() + { + assert(!this->empty()); + const auto back_index = m_index == 0 ? m_buffer.size() - 1 : m_index - 1; + return m_buffer[back_index]; + } + /**@}*/ + +private: + std::vector m_buffer; + std::size_t m_index{0}; + bool m_full{false}; +}; + +} // namespace nvbench::detail diff --git a/nvbench/detail/statistics.cuh b/nvbench/detail/statistics.cuh index 39c003e..957bca4 100644 --- a/nvbench/detail/statistics.cuh +++ b/nvbench/detail/statistics.cuh @@ -18,48 +18,47 @@ #pragma once +#include + #include #include #include +#include #include -#include -#include +#include -namespace nvbench::detail +namespace nvbench::detail::statistics { /** - * Given a vector of samples and the precomputed sum of all samples in the - * vector, return a measure of the noise in the samples. + * Computes and returns the unbiased sample standard deviation. * - * The noise metric is the relative unbiased sample standard deviation - * (std_dev / mean). + * If the input has fewer than 5 sample, infinity is returned. */ -inline nvbench::float64_t -compute_noise(const std::vector &data, - nvbench::float64_t sum) +template ::value_type> +ValueType standard_deviation(Iter first, Iter last, ValueType mean) { - const auto num = static_cast(data.size()); + static_assert(std::is_floating_point_v); + + const auto num = last - first; if (num < 5) // don't bother with low sample sizes. { - return std::numeric_limits::infinity(); + return std::numeric_limits::infinity(); } - const auto mean = sum / num; - const auto variance = - nvbench::detail::transform_reduce(data.cbegin(), - data.cend(), - 0., - std::plus<>{}, - [mean](nvbench::float64_t val) { - val -= mean; - val *= val; - return val; - }) / - (num - 1); - const auto abs_stdev = std::sqrt(variance); - return abs_stdev / mean; + const auto variance = nvbench::detail::transform_reduce(first, + last, + ValueType{}, + std::plus<>{}, + [mean](auto val) { + val -= mean; + val *= val; + return val; + }) / + static_cast((num - 1)); + return std::sqrt(variance); } -} // namespace nvbench::detail +} // namespace nvbench::detail::statistics diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index fe39688..4928ebc 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -10,6 +10,7 @@ set(test_srcs named_values.cu option_parser.cu range.cu + ring_buffer.cu runner.cu state.cu state_generator.cu @@ -36,3 +37,4 @@ foreach(test_src IN LISTS test_srcs) endforeach() add_subdirectory(cmake) +add_subdirectory(device) diff --git a/testing/ring_buffer.cu b/testing/ring_buffer.cu new file mode 100644 index 0000000..4e13805 --- /dev/null +++ b/testing/ring_buffer.cu @@ -0,0 +1,90 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "test_asserts.cuh" + +#include +#include + +template +bool equal(const nvbench::detail::ring_buffer &buffer, + const std::vector &reference) +{ + return std::equal(buffer.cbegin(), buffer.cend(), reference.cbegin()); +} + +int main() +try +{ + nvbench::detail::ring_buffer avg(3); + ASSERT(avg.capacity() == 3); + ASSERT(avg.size() == 0); + ASSERT(avg.empty()); + ASSERT(equal(avg, {0, 0, 0})); + + avg.push_back(32); + ASSERT(!avg.empty()); + ASSERT(avg.size() == 1); + ASSERT(avg.capacity() == 3); + ASSERT_MSG(avg.back() == 32, " (got {})", avg.back()); + ASSERT(equal(avg, {32, 0, 0})); + + avg.push_back(2); + ASSERT(avg.size() == 2); + ASSERT(avg.capacity() == 3); + ASSERT_MSG(avg.back() == 2, " (got {})", avg.back()); + ASSERT(equal(avg, {32, 2, 0})); + + avg.push_back(-15); + ASSERT(avg.size() == 3); + ASSERT(avg.capacity() == 3); + ASSERT_MSG(avg.back() == -15, " (got {})", avg.back()); + ASSERT(equal(avg, {32, 2, -15})); + + avg.push_back(5); + ASSERT(avg.size() == 3); + ASSERT(avg.capacity() == 3); + ASSERT_MSG(avg.back() == 5, " (got {})", avg.back()); + ASSERT(equal(avg, {5, 2, -15})); + + avg.push_back(0); + ASSERT(avg.size() == 3); + ASSERT(avg.capacity() == 3); + ASSERT(equal(avg, {5, 0, -15})); + ASSERT_MSG(avg.back() == 0, " (got {})", avg.back()); + + avg.push_back(128); + ASSERT(avg.size() == 3); + ASSERT(avg.capacity() == 3); + ASSERT(equal(avg, {5, 0, 128})); + ASSERT_MSG(avg.back() == 128, " (got {})", avg.back()); + + avg.clear(); + ASSERT(avg.empty()); + ASSERT(avg.size() == 0); + ASSERT(avg.capacity() == 3); + + return 0; +} +catch (std::exception &err) +{ + fmt::print(stderr, "{}", err.what()); + return 1; +}