From e292bb4eecb791c1e0d9df01c47ee1af4686d1ef Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 1 May 2026 15:23:26 -0500 Subject: [PATCH] Add statistics::compute_percentiles, use it in summaries of measure_cold Percentiles on empty dataset are NaN, not infinity Add Robust statistics of CPU times to summary Fixed name for nv/cold/time/gpu/q3, corrected value reported for nv/cold/time/gpu/ir/relative Use median and IR to compute location and noise in measure_cold Also in stdrel_criterion, compute noise as IR / median. --- nvbench/detail/measure_cold.cu | 110 +++++++++++++++++++++++++++- nvbench/detail/measure_cpu_only.cxx | 50 +++++++++++++ nvbench/detail/statistics.cuh | 53 ++++++++++++++ nvbench/detail/stdrel_criterion.cxx | 17 +++-- testing/statistics.cu | 46 ++++++++++++ 5 files changed, 264 insertions(+), 12 deletions(-) diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index dd24b87..b56f952 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -206,6 +206,7 @@ void measure_cold_base::generate_summaries() summ.set_int64("value", m_total_samples); } + // cpu time statistics { auto &summ = m_state.add_summary("nv/cold/time/cpu/min"); summ.set_string("name", "Min CPU Time"); @@ -237,6 +238,7 @@ void measure_cold_base::generate_summaries() summ.set_string("description", "Mean isolated kernel execution time " "(measured on host CPU)"); + summ.set_string("hide", "Hidden by default."); summ.set_float64("value", cpu_mean); } @@ -247,7 +249,7 @@ void measure_cold_base::generate_summaries() auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/absolute"); summ.set_string("name", "Noise"); summ.set_string("hint", "duration"); - summ.set_string("description", "Standard deviation of isolated CPU times"); + summ.set_string("description", "Standard deviation of isolated kernel execution CPU times"); summ.set_float64("value", cpu_stdev); 
summ.set_string("hide", "Hidden by default."); } @@ -257,10 +259,60 @@ void measure_cold_base::generate_summaries() auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative"); summ.set_string("name", "Noise"); summ.set_string("hint", "percentage"); - summ.set_string("description", "Relative standard deviation of isolated CPU times"); + summ.set_string("description", + "Relative standard deviation of isolated kernel execution CPU times"); + summ.set_string("hide", "Hidden by default."); summ.set_float64("value", cpu_noise); } + const auto [cpu_time_first_quartile, cpu_time_median, cpu_time_third_quartile] = + nvbench::detail::statistics::compute_percentiles(m_cpu_times.cbegin(), + m_cpu_times.cend(), + {25, 50, 75}); + { + auto &summ = m_state.add_summary("nv/cold/time/cpu/q1"); + summ.set_string("name", "Q1"); + summ.set_string("hint", "duration"); + summ.set_string("description", "First quartile of isolated kernel execution CPU times"); + summ.set_float64("value", cpu_time_first_quartile); + summ.set_string("hide", "Hidden by default."); + } + { + auto &summ = m_state.add_summary("nv/cold/time/cpu/median"); + summ.set_string("name", "CPU Time"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Median of isolated kernel execution CPU times"); + summ.set_float64("value", cpu_time_median); + } + { + auto &summ = m_state.add_summary("nv/cold/time/cpu/q3"); + summ.set_string("name", "Q3"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Third quartile of isolated kernel execution CPU times"); + summ.set_string("hide", "Hidden by default."); + summ.set_float64("value", cpu_time_third_quartile); + } + { + auto &summ = m_state.add_summary("nv/cold/time/cpu/ir/absolute"); + summ.set_string("name", "IR"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Interquartile range of isolated kernel execution CPU times"); + summ.set_string("hide", "Hidden by default."); + const auto cpu_time_ir = 
cpu_time_third_quartile - cpu_time_first_quartile; + summ.set_float64("value", cpu_time_ir); + } + { + auto &summ = m_state.add_summary("nv/cold/time/cpu/ir/relative"); + summ.set_string("name", "Noise"); + summ.set_string("hint", "percentage"); + summ.set_string("description", + "Relative interquartile range of isolated kernel execution CPU times"); + const auto cpu_time_ir = cpu_time_third_quartile - cpu_time_first_quartile; + const auto cpu_robust_noise = cpu_time_ir / cpu_time_median; + summ.set_float64("value", cpu_robust_noise); + } + + // gpu time statistics { auto &summ = m_state.add_summary("nv/cold/time/gpu/min"); summ.set_string("name", "Min GPU Time"); @@ -291,6 +343,7 @@ void measure_cold_base::generate_summaries() summ.set_string("description", "Mean isolated kernel execution time " "(measured with CUDA events)"); + summ.set_string("hide", "Hidden by default."); summ.set_float64("value", cuda_mean); } @@ -301,7 +354,7 @@ void measure_cold_base::generate_summaries() auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/absolute"); summ.set_string("name", "Noise"); summ.set_string("hint", "duration"); - summ.set_string("description", "Standard deviation of isolated GPU times"); + summ.set_string("description", "Standard deviation of isolated kernel execution GPU times"); summ.set_float64("value", cuda_stdev); summ.set_string("hide", "Hidden by default."); } @@ -311,10 +364,59 @@ void measure_cold_base::generate_summaries() auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative"); summ.set_string("name", "Noise"); summ.set_string("hint", "percentage"); - summ.set_string("description", "Relative standard deviation of isolated GPU times"); + summ.set_string("description", + "Relative standard deviation of isolated kernel execution GPU times"); + summ.set_string("hide", "Hidden by default."); summ.set_float64("value", cuda_noise); } + const auto [cuda_time_first_quartile, cuda_time_median, cuda_time_third_quartile] = + 
nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(), + m_cuda_times.cend(), + {25, 50, 75}); + { + auto &summ = m_state.add_summary("nv/cold/time/gpu/q1"); + summ.set_string("name", "Q1"); + summ.set_string("hint", "duration"); + summ.set_string("description", "First quartile of isolated kernel execution GPU times"); + summ.set_float64("value", cuda_time_first_quartile); + summ.set_string("hide", "Hidden by default."); + } + { + auto &summ = m_state.add_summary("nv/cold/time/gpu/median"); + summ.set_string("name", "GPU Time"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Median of isolated kernel execution GPU times"); + summ.set_float64("value", cuda_time_median); + } + { + auto &summ = m_state.add_summary("nv/cold/time/gpu/q3"); + summ.set_string("name", "Q3"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Third quartile of isolated kernel execution GPU times"); + summ.set_string("hide", "Hidden by default."); + summ.set_float64("value", cuda_time_third_quartile); + } + { + auto &summ = m_state.add_summary("nv/cold/time/gpu/ir/absolute"); + summ.set_string("name", "IR"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Interquartile range of isolated kernel execution GPU times"); + summ.set_string("hide", "Hidden by default."); + const auto cuda_time_ir = cuda_time_third_quartile - cuda_time_first_quartile; + summ.set_float64("value", cuda_time_ir); + } + { + auto &summ = m_state.add_summary("nv/cold/time/gpu/ir/relative"); + summ.set_string("name", "Noise"); + summ.set_string("hint", "percentage"); + summ.set_string("description", + "Relative interquartile range of isolated kernel execution GPU times"); + const auto cuda_time_ir = cuda_time_third_quartile - cuda_time_first_quartile; + const auto cuda_robust_noise = cuda_time_ir / cuda_time_median; + summ.set_float64("value", cuda_robust_noise); + } + if (const auto items = m_state.get_element_count(); items != 0) { 
auto &summ = m_state.add_summary("nv/cold/bw/item_rate"); diff --git a/nvbench/detail/measure_cpu_only.cxx b/nvbench/detail/measure_cpu_only.cxx index 3cdda93..e5a0992 100644 --- a/nvbench/detail/measure_cpu_only.cxx +++ b/nvbench/detail/measure_cpu_only.cxx @@ -173,6 +173,56 @@ void measure_cpu_only_base::generate_summaries() summ.set_float64("value", cpu_noise); } + const auto [cpu_first_quartile, cpu_median, cpu_third_quartile] = + nvbench::detail::statistics::compute_percentiles(m_cpu_times.cbegin(), + m_cpu_times.cend(), + {25, 50, 75}); + { + auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/q1"); + summ.set_string("name", "Q1"); + summ.set_string("hint", "duration"); + summ.set_string("description", "First quartile of CPU times of isolated kernel executions"); + summ.set_float64("value", cpu_first_quartile); + summ.set_string("hide", "Hidden by default."); + } + { + auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/median"); + summ.set_string("name", "Median"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Median of CPU times of isolated kernel executions"); + summ.set_float64("value", cpu_median); + summ.set_string("hide", "Hidden by default."); + } + { + auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/q3"); + summ.set_string("name", "Q3"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Third quartile of CPU times of isolated kernel executions"); + summ.set_string("hide", "Hidden by default."); + summ.set_float64("value", cpu_third_quartile); + } + { + auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/ir/absolute"); + summ.set_string("name", "IR"); + summ.set_string("hint", "duration"); + summ.set_string("description", + "Interquartile range of CPU times of isolated kernel executions"); + summ.set_string("hide", "Hidden by default."); + const auto cpu_ir = cpu_third_quartile - cpu_first_quartile; + summ.set_float64("value", cpu_ir); + } + { + auto &summ = 
m_state.add_summary("nv/cpu_only/time/cpu/ir/relative"); + summ.set_string("name", "IR"); + summ.set_string("hint", "percentage"); + summ.set_string("description", + "Relative interquartile range of CPU times of isolated kernel executions"); + summ.set_string("hide", "Hidden by default."); + const auto cpu_ir = cpu_third_quartile - cpu_first_quartile; + const auto cpu_robust_noise = cpu_ir / cpu_median; + summ.set_float64("value", cpu_robust_noise); + } + if (const auto items = m_state.get_element_count(); items != 0) { auto &summ = m_state.add_summary("nv/cpu_only/bw/item_rate"); diff --git a/nvbench/detail/statistics.cuh b/nvbench/detail/statistics.cuh index 225403b..93920a8 100644 --- a/nvbench/detail/statistics.cuh +++ b/nvbench/detail/statistics.cuh @@ -31,12 +31,15 @@ #include #include +#include +#include #include #include #include #include #include #include +#include #ifndef M_PI #define M_PI 3.14159265358979323846 @@ -93,6 +96,56 @@ nvbench::float64_t compute_mean(It first, It last) return std::accumulate(first, last, 0.0) / static_cast(num); } +/** + * Computes exact percentile values using rank round(p / 100 * (S - 1)). + * + * The input range is copied before sorting, so const iterators are supported. + * If the input has fewer than 1 sample, all percentiles are returned as NaN. 
+ */ +template ::value_type> +std::array compute_percentiles(Iter first, Iter last, std::array percentiles) +{ + std::array result{}; + + const auto num = std::distance(first, last); + if (num < 1) + { + result.fill(std::numeric_limits::quiet_NaN()); + return result; + } + + std::vector sorted(first, last); + std::sort(sorted.begin(), sorted.end()); + + const auto max_rank = static_cast(sorted.size() - 1); + for (std::size_t i = 0; i < N; ++i) + { + const auto clamped_percentile = std::clamp(percentiles[i], 0, 100); + + const auto quantile = static_cast(clamped_percentile) / 100.0; + const auto rank = static_cast(std::round(quantile * max_rank)); + + result[i] = sorted[rank]; + } + + return result; +} + +/** + * Overload that supports calls like `compute_percentiles(first, last, {25, 50, 75})`. + */ +template ::value_type> +std::array compute_percentiles(Iter first, Iter last, const int (&percentiles)[N]) +{ + std::array percentile_array{}; + std::copy(std::begin(percentiles), std::end(percentiles), percentile_array.begin()); + return compute_percentiles(first, last, percentile_array); +} + /** * Computes linear regression and returns the slope and intercept * diff --git a/nvbench/detail/stdrel_criterion.cxx b/nvbench/detail/stdrel_criterion.cxx index 38d7948..f03409f 100644 --- a/nvbench/detail/stdrel_criterion.cxx +++ b/nvbench/detail/stdrel_criterion.cxx @@ -42,14 +42,15 @@ void stdrel_criterion::do_add_measurement(nvbench::float64_t measurement) m_cuda_times.push_back(measurement); // Compute convergence statistics using CUDA timings: - const auto mean_cuda_time = m_total_cuda_time / static_cast(m_total_samples); - const auto cuda_stdev = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(), - m_cuda_times.cend(), - mean_cuda_time); - const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time; - if (std::isfinite(cuda_rel_stdev)) + const auto [cuda_first_quartile, cuda_median, cuda_third_quartile] = + 
nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(), + m_cuda_times.cend(), + {25, 50, 75}); + const auto cuda_noise = (cuda_third_quartile - cuda_first_quartile) / cuda_median; + + if (std::isfinite(cuda_noise)) { - m_noise_tracker.push_back(cuda_rel_stdev); + m_noise_tracker.push_back(cuda_noise); } } @@ -71,7 +72,7 @@ bool stdrel_criterion::do_is_finished() return true; } - // Check if the noise (cuda rel stdev) has converged by inspecting a + // Check if the noise has converged by inspecting a // trailing window of recorded noise measurements. // This helps identify benchmarks that are inherently noisy and would // never converge to the target stdev threshold. This check ensures that the diff --git a/testing/statistics.cu b/testing/statistics.cu index db4a40d..ed49a5b 100644 --- a/testing/statistics.cu +++ b/testing/statistics.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include "test_asserts.cuh" @@ -52,6 +53,50 @@ void test_std() ASSERT(std::abs(actual - expected) < 0.001); } +void test_percentiles() +{ + { + const std::vector data{40.0, 10.0, 30.0, 20.0}; + const auto actual = statistics::compute_percentiles(data.cbegin(), + data.cend(), + std::array{0, 25, 50, 75, 100}); + const std::array expected{10.0, 20.0, 30.0, 30.0, 40.0}; + ASSERT(actual == expected); + } + + { + const std::vector data{42.0}; + const auto actual = + statistics::compute_percentiles(data.cbegin(), data.cend(), std::array{25, 50, 75}); + const std::array expected{42.0, 42.0, 42.0}; + ASSERT(actual == expected); + } + + { + const std::vector data{40.0, 10.0, 30.0, 20.0}; + const auto actual = statistics::compute_percentiles(data.cbegin(), data.cend(), {25, 50, 75}); + const std::array expected{20.0, 30.0, 30.0}; + ASSERT(actual == expected); + } + + { + const std::vector data{10.0, 20.0, 30.0, 40.0}; + const auto actual = + statistics::compute_percentiles(data.cbegin(), data.cend(), std::array{-25, 125}); + const std::array expected{10.0, 40.0}; + 
ASSERT(actual == expected); + } + + { + const std::vector data; + const auto actual = + statistics::compute_percentiles(data.cbegin(), data.cend(), std::array{25, 50, 75}); + ASSERT(!std::isfinite(actual[0])); + ASSERT(!std::isfinite(actual[1])); + ASSERT(!std::isfinite(actual[2])); + } +} + void test_lin_regression() { { @@ -126,6 +171,7 @@ int main() { test_mean(); test_std(); + test_percentiles(); test_lin_regression(); test_r2(); test_slope_conversion();