Implement new convergence check for noisy kernels.

Previously, convergence was tested by waiting for the relative standard deviation of CUDA timings ("noise") to drop below a certain percentage (`max_noise`). This assumed that every benchmark's noise would eventually drop below that threshold, but this is not the case. In practice, many benchmarks never reach the default 0.5% relative stdev and instead always run to the 15s timeout -- even if the means have converged in a second or two. Added a new check that detects when the noise itself has stabilized and ends the benchmark at that point, even if noise > `max_noise`. In testing, this patch alone significantly reduces the runtime of the Thrust+CUB benchmark suite (from 30 hours to 5 hours) and produces similar timing results. The parameters used to tune this feature are not exposed -- if this approach works long-term and there's a strong motivation to let users tweak them, then we can worry about names/APIs/CLI/docs later.
2026-03-14 20:27:24 +00:00 · 2021-12-21 21:16:17 -05:00
parent 8e56a7bd94
commit 178dd0eb68
6 changed files with 382 additions and 87 deletions
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -10,6 +10,7 @@ set(test_srcs
  named_values.cu
  option_parser.cu
  range.cu
+  ring_buffer.cu
  runner.cu
  state.cu
  state_generator.cu
@@ -36,3 +37,4 @@ foreach(test_src IN LISTS test_srcs)
 endforeach()

 add_subdirectory(cmake)
+add_subdirectory(device)
--- a/testing/ring_buffer.cu
+++ b/testing/ring_buffer.cu
@@ -0,0 +1,90 @@
+/*
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/detail/ring_buffer.cuh>
+
+#include "test_asserts.cuh"
+
+#include <algorithm>
+#include <vector>
+
+template <typename T> // Test helper: element-wise comparison of a ring_buffer against expected values.
+bool equal(const nvbench::detail::ring_buffer<T> &buffer,
+           const std::vector<T> &reference)
+{ // Three-iterator std::equal: walks [cbegin, cend) of `buffer`, reading the same number of elements from `reference`.
+  return std::equal(buffer.cbegin(), buffer.cend(), reference.cbegin()); // `reference` length is not checked; it must be at least as long as the buffer range.
+}
+
+int main() // Walks a capacity-3 ring_buffer<int> through fill, wrap-around, and clear; returns 0 on success.
+try
+{
+  nvbench::detail::ring_buffer<int> avg(3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT(avg.size() == 0);
+  ASSERT(avg.empty());
+  ASSERT(equal(avg, {0, 0, 0})); // NOTE(review): how many elements are compared here depends on cend() for an empty buffer -- confirm in ring_buffer.cuh.
+  // Pushes 1-3: size grows by one per push, capacity stays 3, back() is the newest value.
+  avg.push_back(32);
+  ASSERT(!avg.empty());
+  ASSERT(avg.size() == 1);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == 32, " (got {})", avg.back());
+  ASSERT(equal(avg, {32, 0, 0}));
+  // Second push lands in the next slot.
+  avg.push_back(2);
+  ASSERT(avg.size() == 2);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == 2, " (got {})", avg.back());
+  ASSERT(equal(avg, {32, 2, 0}));
+  // Third push fills the buffer (size == capacity).
+  avg.push_back(-15);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == -15, " (got {})", avg.back());
+  ASSERT(equal(avg, {32, 2, -15}));
+  // Pushes 4-6: buffer is full, so each push is expected to overwrite the oldest slot in place (0, then 1, then 2).
+  avg.push_back(5);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == 5, " (got {})", avg.back());
+  ASSERT(equal(avg, {5, 2, -15}));
+  // Fifth push is expected to replace slot 1.
+  avg.push_back(0);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT(equal(avg, {5, 0, -15}));
+  ASSERT_MSG(avg.back() == 0, " (got {})", avg.back());
+  // Sixth push is expected to replace slot 2, completing one full wrap.
+  avg.push_back(128);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT(equal(avg, {5, 0, 128}));
+  ASSERT_MSG(avg.back() == 128, " (got {})", avg.back());
+  // clear() must empty the buffer while leaving its capacity unchanged.
+  avg.clear();
+  ASSERT(avg.empty());
+  ASSERT(avg.size() == 0);
+  ASSERT(avg.capacity() == 3);
+
+  return 0;
+}
+catch (const std::exception &err) // Caught by const reference: the handler only reads what(). Presumably raised by a failed ASSERT -- confirm in test_asserts.cuh.
+{
+  fmt::print(stderr, "{}", err.what());
+  return 1; // Non-zero exit status reports the failure to the test runner.
+}