Add statistics::compute_percentiles, use it in summaries of measure_cold

Percentiles on empty dataset are NaN, not infinity

Add robust statistics of CPU times to summary

Fixed name for nv/cold/time/gpu/q3, corrected value reported for
nv/cold/time/gpu/ir/relative

Use median and IR to compute location and noise in measure_cold

Also in stdrel_criterion, compute noise as IR / median.
This commit is contained in:
Oleksandr Pavlyk
2026-05-01 15:23:26 -05:00
parent e9daaba0f9
commit e292bb4eec
5 changed files with 264 additions and 12 deletions

View File

@@ -206,6 +206,7 @@ void measure_cold_base::generate_summaries()
summ.set_int64("value", m_total_samples);
}
// cpu time statistics
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/min");
summ.set_string("name", "Min CPU Time");
@@ -237,6 +238,7 @@ void measure_cold_base::generate_summaries()
summ.set_string("description",
"Mean isolated kernel execution time "
"(measured on host CPU)");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cpu_mean);
}
@@ -247,7 +249,7 @@ void measure_cold_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/absolute");
summ.set_string("name", "Noise");
summ.set_string("hint", "duration");
summ.set_string("description", "Standard deviation of isolated CPU times");
summ.set_string("description", "Standard deviation of isolated kernel execution CPU times");
summ.set_float64("value", cpu_stdev);
summ.set_string("hide", "Hidden by default.");
}
@@ -257,10 +259,60 @@ void measure_cold_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
summ.set_string("name", "Noise");
summ.set_string("hint", "percentage");
summ.set_string("description", "Relative standard deviation of isolated CPU times");
summ.set_string("description",
"Relative standard deviation of isolated kernel execution CPU times");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cpu_noise);
}
const auto [cpu_time_first_quartile, cpu_time_median, cpu_time_third_quartile] =
nvbench::detail::statistics::compute_percentiles(m_cpu_times.cbegin(),
m_cpu_times.cend(),
{25, 50, 75});
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/q1");
summ.set_string("name", "Q1");
summ.set_string("hint", "duration");
summ.set_string("description", "First quartile of isolated kernel execution CPU times");
summ.set_float64("value", cpu_time_first_quartile);
summ.set_string("hide", "Hidden by default.");
}
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/median");
summ.set_string("name", "CPU Time");
summ.set_string("hint", "duration");
summ.set_string("description", "Median of isolated kernel execution CPU times");
summ.set_float64("value", cpu_time_median);
}
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/q3");
summ.set_string("name", "Q3");
summ.set_string("hint", "duration");
summ.set_string("description", "Third quartile of isolated kernel execution CPU times");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cpu_time_third_quartile);
}
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/ir/absolute");
summ.set_string("name", "IR");
summ.set_string("hint", "duration");
summ.set_string("description", "Interquartile range of isolated kernel execution CPU times");
summ.set_string("hide", "Hidden by default.");
const auto cpu_time_ir = cpu_time_third_quartile - cpu_time_first_quartile;
summ.set_float64("value", cpu_time_ir);
}
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/ir/relative");
summ.set_string("name", "Noise");
summ.set_string("hint", "percentage");
summ.set_string("description",
"Relative interquartile range of isolated kernel execution CPU times");
const auto cpu_time_ir = cpu_time_third_quartile - cpu_time_first_quartile;
const auto cpu_robust_noise = cpu_time_ir / cpu_time_median;
summ.set_float64("value", cpu_robust_noise);
}
// gpu time statistics
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/min");
summ.set_string("name", "Min GPU Time");
@@ -291,6 +343,7 @@ void measure_cold_base::generate_summaries()
summ.set_string("description",
"Mean isolated kernel execution time "
"(measured with CUDA events)");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cuda_mean);
}
@@ -301,7 +354,7 @@ void measure_cold_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/absolute");
summ.set_string("name", "Noise");
summ.set_string("hint", "duration");
summ.set_string("description", "Standard deviation of isolated GPU times");
summ.set_string("description", "Standard deviation of isolated kernel execution GPU times");
summ.set_float64("value", cuda_stdev);
summ.set_string("hide", "Hidden by default.");
}
@@ -311,10 +364,59 @@ void measure_cold_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
summ.set_string("name", "Noise");
summ.set_string("hint", "percentage");
summ.set_string("description", "Relative standard deviation of isolated GPU times");
summ.set_string("description",
"Relative standard deviation of isolated kernel execution GPU times");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cuda_noise);
}
const auto [cuda_time_first_quartile, cuda_time_median, cuda_time_third_quartile] =
nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(),
m_cuda_times.cend(),
{25, 50, 75});
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/q1");
summ.set_string("name", "Q1");
summ.set_string("hint", "duration");
summ.set_string("description", "First quartile of isolated kernel execution GPU times");
summ.set_float64("value", cuda_time_first_quartile);
summ.set_string("hide", "Hidden by default.");
}
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/median");
summ.set_string("name", "GPU Time");
summ.set_string("hint", "duration");
summ.set_string("description", "Median of isolated kernel execution GPU times");
summ.set_float64("value", cuda_time_median);
}
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/q3");
summ.set_string("name", "Q3");
summ.set_string("hint", "duration");
summ.set_string("description", "Third quartile of isolated kernel execution GPU times");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cuda_time_third_quartile);
}
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/ir/absolute");
summ.set_string("name", "IR");
summ.set_string("hint", "duration");
summ.set_string("description", "Interquartile range of isolated kernel execution GPU times");
summ.set_string("hide", "Hidden by default.");
const auto cuda_time_ir = cuda_time_third_quartile - cuda_time_first_quartile;
summ.set_float64("value", cuda_time_ir);
}
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/ir/relative");
summ.set_string("name", "Noise");
summ.set_string("hint", "percentage");
summ.set_string("description",
"Relative interquartile range of isolated kernel execution GPU times");
const auto cuda_time_ir = cuda_time_third_quartile - cuda_time_first_quartile;
const auto cuda_robust_noise = cuda_time_ir / cuda_time_median;
summ.set_float64("value", cuda_robust_noise);
}
if (const auto items = m_state.get_element_count(); items != 0)
{
auto &summ = m_state.add_summary("nv/cold/bw/item_rate");

View File

@@ -173,6 +173,56 @@ void measure_cpu_only_base::generate_summaries()
summ.set_float64("value", cpu_noise);
}
const auto [cpu_first_quartile, cpu_median, cpu_third_quartile] =
nvbench::detail::statistics::compute_percentiles(m_cpu_times.cbegin(),
m_cpu_times.cend(),
{25, 50, 75});
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/q1");
summ.set_string("name", "Q1");
summ.set_string("hint", "duration");
summ.set_string("description", "First quartile of CPU times of isolated kernel executions");
summ.set_float64("value", cpu_first_quartile);
summ.set_string("hide", "Hidden by default.");
}
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/median");
summ.set_string("name", "Median");
summ.set_string("hint", "duration");
summ.set_string("description", "Median of CPU times of isolated kernel executions");
summ.set_float64("value", cpu_median);
summ.set_string("hide", "Hidden by default.");
}
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/q3");
summ.set_string("name", "Q3");
summ.set_string("hint", "duration");
summ.set_string("description", "Third quartile of CPU times of isolated kernel executions");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cpu_third_quartile);
}
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/ir/absolute");
summ.set_string("name", "IR");
summ.set_string("hint", "duration");
summ.set_string("description",
"Interquartile range of CPU times of isolated kernel executions");
summ.set_string("hide", "Hidden by default.");
const auto cpu_ir = cpu_third_quartile - cpu_first_quartile;
summ.set_float64("value", cpu_ir);
}
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/ir/relative");
summ.set_string("name", "IR");
summ.set_string("hint", "percentage");
summ.set_string("description",
"Relative interquartile range of CPU times of isolated kernel executions");
summ.set_string("hide", "Hidden by default.");
const auto cpu_ir = cpu_third_quartile - cpu_first_quartile;
const auto cpu_robust_noise = cpu_ir / cpu_median;
summ.set_float64("value", cpu_robust_noise);
}
if (const auto items = m_state.get_element_count(); items != 0)
{
auto &summ = m_state.add_summary("nv/cpu_only/bw/item_rate");

View File

@@ -31,12 +31,15 @@
#include <nvbench/detail/transform_reduce.cuh>
#include <nvbench/types.cuh>
#include <algorithm>
#include <array>
#include <cmath>
#include <functional>
#include <iterator>
#include <limits>
#include <numeric>
#include <type_traits>
#include <vector>
#ifndef M_PI
#define M_PI 3.14159265358979323846
@@ -93,6 +96,56 @@ nvbench::float64_t compute_mean(It first, It last)
return std::accumulate(first, last, 0.0) / static_cast<nvbench::float64_t>(num);
}
/**
 * Computes percentile values of the samples in `[first, last)` without interpolation.
 *
 * Each requested percentile `p` is clamped to `[0, 100]` and mapped to the element at
 * index `round(p / 100 * (S - 1))` of the sorted samples, where `S` is the number of
 * samples. Every returned value is therefore one of the input samples.
 *
 * The input range is copied before sorting, so const iterators are supported and the
 * caller's data is not reordered.
 *
 * If the input has fewer than 1 sample, all percentiles are returned as
 * `std::numeric_limits<ValueType>::quiet_NaN()` (NaN, not infinity). Note that this is
 * only a real NaN when `ValueType` has a quiet NaN representation, i.e. when
 * `std::numeric_limits<ValueType>::has_quiet_NaN` is true.
 *
 * @param first Iterator to the first sample.
 * @param last Iterator one past the last sample.
 * @param percentiles Requested percentiles, expressed in percent.
 * @return Array whose i-th element is the sample selected for `percentiles[i]`.
 */
template <typename Iter,
          std::size_t N,
          typename ValueType = typename std::iterator_traits<Iter>::value_type>
std::array<ValueType, N> compute_percentiles(Iter first, Iter last, std::array<int, N> percentiles)
{
  std::array<ValueType, N> result{};

  // No samples: there is no element to select, so report NaN for every percentile.
  const auto num = std::distance(first, last);
  if (num < 1)
  {
    result.fill(std::numeric_limits<ValueType>::quiet_NaN());
    return result;
  }

  // Sort a private copy; the caller's range may be const and must keep its order.
  std::vector<ValueType> sorted(first, last);
  std::sort(sorted.begin(), sorted.end());

  // Largest valid index into `sorted`; percentile 100 maps exactly onto it.
  const auto max_rank = static_cast<nvbench::float64_t>(sorted.size() - 1);

  for (std::size_t i = 0; i < N; ++i)
  {
    // Clamping keeps the computed rank inside [0, max_rank] for out-of-range requests.
    const auto clamped_percentile = std::clamp(percentiles[i], 0, 100);
    const auto quantile           = static_cast<nvbench::float64_t>(clamped_percentile) / 100.0;
    const auto rank = static_cast<std::size_t>(std::round(quantile * max_rank));
    result[i]       = sorted[rank];
  }

  return result;
}
/**
 * Overload that supports calls like `compute_percentiles(first, last, {25, 50, 75})`.
 *
 * Taking the percentiles as a reference to a built-in array lets `N` be deduced from a
 * braced list, which is not possible for a `std::array<int, N>` parameter. The values
 * are copied into a `std::array<int, N>` and the call is forwarded to the `std::array`
 * overload, whose result is returned unchanged.
 *
 * @param first Iterator to the first sample.
 * @param last Iterator one past the last sample.
 * @param percentiles Requested percentiles, expressed in percent.
 * @return Array whose i-th element is the sample selected for `percentiles[i]`.
 */
template <typename Iter,
          std::size_t N,
          typename ValueType = typename std::iterator_traits<Iter>::value_type>
std::array<ValueType, N> compute_percentiles(Iter first, Iter last, const int (&percentiles)[N])
{
  std::array<int, N> percentile_array{};
  std::copy(std::begin(percentiles), std::end(percentiles), percentile_array.begin());

  // Forward the template arguments explicitly: relying on deduction would silently
  // fall back to the default ValueType and produce an array type that does not match
  // this function's return type when the caller requested a different ValueType.
  return compute_percentiles<Iter, N, ValueType>(first, last, percentile_array);
}
/**
* Computes linear regression and returns the slope and intercept
*

View File

@@ -42,14 +42,15 @@ void stdrel_criterion::do_add_measurement(nvbench::float64_t measurement)
m_cuda_times.push_back(measurement);
// Compute convergence statistics using CUDA timings:
const auto mean_cuda_time = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
const auto cuda_stdev = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
m_cuda_times.cend(),
mean_cuda_time);
const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
if (std::isfinite(cuda_rel_stdev))
const auto [cuda_first_quartile, cuda_median, cuda_third_quartile] =
nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(),
m_cuda_times.cend(),
{25, 50, 75});
const auto cuda_noise = (cuda_third_quartile - cuda_first_quartile) / cuda_median;
if (std::isfinite(cuda_noise))
{
m_noise_tracker.push_back(cuda_rel_stdev);
m_noise_tracker.push_back(cuda_noise);
}
}
@@ -71,7 +72,7 @@ bool stdrel_criterion::do_is_finished()
return true;
}
// Check if the noise (cuda rel stdev) has converged by inspecting a
// Check if the noise has converged by inspecting a
// trailing window of recorded noise measurements.
// This helps identify benchmarks that are inherently noisy and would
// never converge to the target stdev threshold. This check ensures that the

View File

@@ -20,6 +20,7 @@
#include <nvbench/types.cuh>
#include <algorithm>
#include <array>
#include <vector>
#include "test_asserts.cuh"
@@ -52,6 +53,50 @@ void test_std()
ASSERT(std::abs(actual - expected) < 0.001);
}
void test_percentiles()
{
{
const std::vector<nvbench::float64_t> data{40.0, 10.0, 30.0, 20.0};
const auto actual = statistics::compute_percentiles(data.cbegin(),
data.cend(),
std::array<int, 5>{0, 25, 50, 75, 100});
const std::array<nvbench::float64_t, 5> expected{10.0, 20.0, 30.0, 30.0, 40.0};
ASSERT(actual == expected);
}
{
const std::vector<nvbench::float64_t> data{42.0};
const auto actual =
statistics::compute_percentiles(data.cbegin(), data.cend(), std::array<int, 3>{25, 50, 75});
const std::array<nvbench::float64_t, 3> expected{42.0, 42.0, 42.0};
ASSERT(actual == expected);
}
{
const std::vector<nvbench::float64_t> data{40.0, 10.0, 30.0, 20.0};
const auto actual = statistics::compute_percentiles(data.cbegin(), data.cend(), {25, 50, 75});
const std::array<nvbench::float64_t, 3> expected{20.0, 30.0, 30.0};
ASSERT(actual == expected);
}
{
const std::vector<nvbench::float64_t> data{10.0, 20.0, 30.0, 40.0};
const auto actual =
statistics::compute_percentiles(data.cbegin(), data.cend(), std::array<int, 2>{-25, 125});
const std::array<nvbench::float64_t, 2> expected{10.0, 40.0};
ASSERT(actual == expected);
}
{
const std::vector<nvbench::float64_t> data;
const auto actual =
statistics::compute_percentiles(data.cbegin(), data.cend(), std::array<int, 3>{25, 50, 75});
ASSERT(!std::isfinite(actual[0]));
ASSERT(!std::isfinite(actual[1]));
ASSERT(!std::isfinite(actual[2]));
}
}
void test_lin_regression()
{
{
@@ -126,6 +171,7 @@ int main()
{
test_mean();
test_std();
test_percentiles();
test_lin_regression();
test_r2();
test_slope_conversion();