From a487a38895550b24c988f6255bda1828cb6d493e Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 22 Apr 2025 14:37:16 -0700 Subject: [PATCH 1/7] Dump frequencies --- nvbench/detail/measure_cold.cu | 3 ++ nvbench/detail/measure_cold.cuh | 1 + nvbench/json_printer.cu | 77 +++++++++++++++++++++++++++++++++ nvbench/json_printer.cuh | 1 + 4 files changed, 82 insertions(+) diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index d6f1dd4..e820110 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -86,6 +86,7 @@ void measure_cold_base::initialize() m_dynamic_throttle_recovery_delay = m_throttle_recovery_delay; m_throttle_discard_count = 0; + m_sm_clock_rates.clear(); m_cuda_times.clear(); m_cpu_times.clear(); @@ -140,6 +141,7 @@ void measure_cold_base::record_measurements() } m_throttle_discard_count = 0; + m_sm_clock_rates.push_back(current_clock_rate); m_sm_clock_rate_accumulator += current_clock_rate; } @@ -445,6 +447,7 @@ void measure_cold_base::generate_summaries() m_total_samples)); printer.process_bulk_data(m_state, "nv/cold/sample_times", "sample_times", m_cuda_times); + printer.process_bulk_data(m_state, "nv/cold/sample_freqs", "sample_freqs", m_sm_clock_rates); } } diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index f3fd80a..b93aae2 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -122,6 +122,7 @@ protected: nvbench::float64_t m_max_cpu_time{}; nvbench::float64_t m_total_cpu_time{}; + std::vector m_sm_clock_rates{}; nvbench::float64_t m_sm_clock_rate_accumulator{}; std::vector m_cuda_times; diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu index 085faae..b8e2a82 100644 --- a/nvbench/json_printer.cu +++ b/nvbench/json_printer.cu @@ -241,6 +241,83 @@ void json_printer::do_process_bulk_data_float64(state &state, fmt::format("Wrote '{}' in {:>6.3f}ms", result_path.string(), timer.get_duration() * 1000)); } } // end 
hint == sample_times + + if (hint == "sample_freqs") + { + nvbench::cpu_timer timer; + timer.start(); + + fs::path result_path{m_stream_name + "-freqs-bin/"}; + try + { + if (!fs::exists(result_path)) + { + if (!fs::create_directory(result_path)) + { + NVBENCH_THROW(std::runtime_error, "{}", "Failed to create result directory '{}'."); + } + } + else if (!fs::is_directory(result_path)) + { + NVBENCH_THROW(std::runtime_error, "{}", "'{}' exists and is not a directory."); + } + + const auto file_id = m_num_jsonbin_freq_files++; + result_path /= fmt::format("{:d}.bin", file_id); + + std::ofstream out; + out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit); + out.open(result_path, std::ios::binary | std::ios::out); + + // FIXME: SLOW -- Writing the binary file, 4 bytes at a time... + // There are a lot of optimizations that could be done here if this ends + // up being a noticeable bottleneck. + for (auto value64 : data) + { + const auto value32 = static_cast(value64); + char buffer[4]; + std::memcpy(buffer, &value32, 4); + // the c++17 implementation of is_little_endian isn't constexpr, but + // all supported compilers optimize this branch as if it were. 
+ if (!is_little_endian()) + { + using std::swap; + swap(buffer[0], buffer[3]); + swap(buffer[1], buffer[2]); + } + out.write(buffer, 4); + } + } + catch (std::exception &e) + { + if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value()) + { + auto &printer = printer_opt_ref.value().get(); + printer.log( + nvbench::log_level::warn, + fmt::format("Error writing {} ({}) to {}: {}", tag, hint, result_path.string(), e.what())); + } + } // end catch + + auto &summ = state.add_summary(fmt::format("nv/json/freqs-bin:{}", tag)); + summ.set_string("name", "Samples Frequencies File"); + summ.set_string("hint", "file/sample_freqs"); + summ.set_string("description", + "Binary file containing sample frequencies as little-endian " + "float32."); + summ.set_string("filename", result_path.string()); + summ.set_int64("size", static_cast(data.size())); + summ.set_string("hide", "Not needed in table."); + + timer.stop(); + if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value()) + { + auto &printer = printer_opt_ref.value().get(); + printer.log( + nvbench::log_level::info, + fmt::format("Wrote '{}' in {:>6.3f}ms", result_path.string(), timer.get_duration() * 1000)); + } + } // end hint == sample_freqs } static void add_devices_section(nlohmann::ordered_json &root) diff --git a/nvbench/json_printer.cuh b/nvbench/json_printer.cuh index ae361ff..eb062b9 100644 --- a/nvbench/json_printer.cuh +++ b/nvbench/json_printer.cuh @@ -73,6 +73,7 @@ protected: bool m_enable_binary_output{false}; std::size_t m_num_jsonbin_files{}; + std::size_t m_num_jsonbin_freq_files{}; std::vector m_argv; }; From 40b2f4ece262cc2fd6322bf92fd0b55b092f8d75 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sat, 3 May 2025 19:54:39 -0700 Subject: [PATCH 2/7] Better place to stop freq timer? 
--- nvbench/detail/measure_cold.cuh | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index b93aae2..786e9b7 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -137,25 +137,33 @@ struct measure_cold_base::kernel_launch_timer : m_measure{measure} , m_disable_blocking_kernel{measure.m_disable_blocking_kernel} , m_run_once{measure.m_run_once} + , m_check_throttling{measure.m_check_throttling} {} explicit kernel_launch_timer(measure_cold_base &measure, bool disable_blocking_kernel, - bool run_once) + bool run_once, + bool check_throttling) : m_measure{measure} , m_disable_blocking_kernel{disable_blocking_kernel} , m_run_once{run_once} + , m_check_throttling{check_throttling} {} __forceinline__ void start() { m_measure.flush_device_l2(); m_measure.sync_stream(); + + // start CPU timer irrespective of use of blocking kernel + // Ref: https://github.com/NVIDIA/nvbench/issues/249 + m_measure.m_cpu_timer.start(); + if (!m_disable_blocking_kernel) { m_measure.block_stream(); } - if (m_measure.m_check_throttling) + if (m_check_throttling) { m_measure.gpu_frequency_start(); } @@ -164,22 +172,19 @@ struct measure_cold_base::kernel_launch_timer m_measure.profiler_start(); } m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream()); - // start CPU timer irrespective of use of blocking kernel - // Ref: https://github.com/NVIDIA/nvbench/issues/249 - m_measure.m_cpu_timer.start(); } __forceinline__ void stop() { m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream()); + if (m_check_throttling) + { + m_measure.gpu_frequency_stop(); + } if (!m_disable_blocking_kernel) { m_measure.unblock_stream(); } - if (m_measure.m_check_throttling) - { - m_measure.gpu_frequency_stop(); - } m_measure.sync_stream(); if (m_run_once) { @@ -192,6 +197,7 @@ private: measure_cold_base &m_measure; bool m_disable_blocking_kernel; bool m_run_once; + bool 
m_check_throttling; }; template @@ -228,7 +234,7 @@ private: // disable use of blocking kernel for warm-up run // see https://github.com/NVIDIA/nvbench/issues/240 constexpr bool disable_blocking_kernel = true; - kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once); + kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once, m_check_throttling); this->launch_kernel(timer); this->check_skip_time(m_cuda_timer.get_duration()); @@ -239,7 +245,7 @@ private: // do not use blocking kernel if benchmark is only run once, e.g., when profiling // ref: https://github.com/NVIDIA/nvbench/issue/242 const bool disable_blocking_kernel = m_run_once || m_disable_blocking_kernel; - kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once); + kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once, m_check_throttling); do { this->launch_kernel(timer); From 988420b5b166ec7cc2088f778d0623186546ba97 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 13 Feb 2026 10:19:06 -0600 Subject: [PATCH 3/7] Use write_out_values utility to save frequencies The utility was already used to save times --- nvbench/json_printer.cu | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu index b8e2a82..5cfc11a 100644 --- a/nvbench/json_printer.cu +++ b/nvbench/json_printer.cu @@ -106,7 +106,11 @@ void write_named_values(JsonNode &node, const nvbench::named_values &values) } // end foreach value name } -template +// choose buffer to be block size of modern SSD +// see: https://github.com/NVIDIA/nvbench/issues/255 +static constexpr std::size_t preferred_buffer_nbytes = 4096; + +template void write_out_values(std::ofstream &out, const std::vector &data) { static constexpr std::size_t value_nbytes = sizeof(nvbench::float32_t); @@ -206,10 +210,7 @@ void json_printer::do_process_bulk_data_float64(state &state, 
out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit); out.open(result_path, std::ios::binary | std::ios::out); - // choose buffer to be block size of modern SSD - // see: https://github.com/NVIDIA/nvbench/issues/255 - constexpr std::size_t buffer_nbytes = 4096; - write_out_values(out, data); + write_out_values(out, data); } catch (std::exception &e) { @@ -269,24 +270,7 @@ void json_printer::do_process_bulk_data_float64(state &state, out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit); out.open(result_path, std::ios::binary | std::ios::out); - // FIXME: SLOW -- Writing the binary file, 4 bytes at a time... - // There are a lot of optimizations that could be done here if this ends - // up being a noticeable bottleneck. - for (auto value64 : data) - { - const auto value32 = static_cast(value64); - char buffer[4]; - std::memcpy(buffer, &value32, 4); - // the c++17 implementation of is_little_endian isn't constexpr, but - // all supported compilers optimize this branch as if it were. - if (!is_little_endian()) - { - using std::swap; - swap(buffer[0], buffer[3]); - swap(buffer[1], buffer[2]); - } - out.write(buffer, 4); - } + write_out_values(out, data); } catch (std::exception &e) { From 4da9f431c0caf6e476d908df2ea9efbcd2869c32 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 19 Feb 2026 15:32:00 -0600 Subject: [PATCH 4/7] Templatize write_out_values for different storage formats This could be used to save data as float32_t, or float64_t. This flexibility is useful for experimentation. 
--- nvbench/json_printer.cu | 53 +++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu index 5cfc11a..4e17359 100644 --- a/nvbench/json_printer.cu +++ b/nvbench/json_printer.cu @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -110,27 +111,45 @@ void write_named_values(JsonNode &node, const nvbench::named_values &values) // see: https://github.com/NVIDIA/nvbench/issues/255 static constexpr std::size_t preferred_buffer_nbytes = 4096; -template -void write_out_values(std::ofstream &out, const std::vector &data) +template +void swap_bytes_impl(char *p, std::index_sequence) { - static constexpr std::size_t value_nbytes = sizeof(nvbench::float32_t); + ((std::swap(p[Is], p[N - 1 - Is])), ...); +} + +template +void big_endian_to_little_endian(char *word) +{ + if constexpr (WordSize > 1) + { + static_assert((WordSize & (WordSize - 1)) == 0, "WordSize must be a power of two"); + swap_bytes_impl(word, std::make_index_sequence{}); + } +} + +template +void write_out_values_as(std::ofstream &out, const std::vector &data) +{ + static_assert(std::is_floating_point_v); + static_assert(std::is_convertible_v); + + static constexpr std::size_t value_nbytes = sizeof(StorageT); static_assert(buffer_nbytes % value_nbytes == 0); - alignas(alignof(nvbench::float32_t)) char buffer[buffer_nbytes]; + alignas(alignof(StorageT)) char buffer[buffer_nbytes]; std::size_t bytes_in_buffer = 0; for (auto value64 : data) { - const auto value32 = static_cast(value64); + const auto value = static_cast(value64); auto value_subbuffer = &buffer[bytes_in_buffer]; - std::memcpy(value_subbuffer, &value32, value_nbytes); + std::memcpy(value_subbuffer, &value, value_nbytes); // the c++17 implementation of is_little_endian isn't constexpr, but // all supported compilers optimize this branch as if it were. 
if (!is_little_endian()) { - std::swap(value_subbuffer[0], value_subbuffer[3]); - std::swap(value_subbuffer[1], value_subbuffer[2]); + big_endian_to_little_endian(value_subbuffer); } bytes_in_buffer += value_nbytes; @@ -149,6 +168,20 @@ void write_out_values(std::ofstream &out, const std::vector } } +// save data using statically downcasting to float32 format +template +void write_out_values_as_float32(std::ofstream &out, const std::vector &data) +{ + write_out_values_as(out, data); +} + +// save data using float64 format +template +void write_out_values_as_float64(std::ofstream &out, const std::vector &data) +{ + write_out_values_as(out, data); +} + } // end namespace namespace nvbench @@ -210,7 +243,7 @@ void json_printer::do_process_bulk_data_float64(state &state, out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit); out.open(result_path, std::ios::binary | std::ios::out); - write_out_values(out, data); + write_out_values_as_float32(out, data); } catch (std::exception &e) { @@ -270,7 +303,7 @@ void json_printer::do_process_bulk_data_float64(state &state, out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit); out.open(result_path, std::ios::binary | std::ios::out); - write_out_values(out, data); + write_out_values_as_float32(out, data); } catch (std::exception &e) { From 731e0c2c3088e8a4370a2cf4d38a08e787a7bf99 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 19 Feb 2026 15:33:57 -0600 Subject: [PATCH 5/7] Swapped data members m_sm_clock_rates and m_sm_clock_rate_accumulator This places all std::vector members together. Added default initialization to all std::vector members, and all other members with default constructors. 
The exceptions are the reference members and the `nvbench::launch m_launch;` member. --- nvbench/detail/measure_cold.cuh | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index 786e9b7..3a372d1 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -85,27 +85,27 @@ protected: nvbench::state &m_state; nvbench::launch m_launch; - nvbench::cuda_timer m_cuda_timer; - nvbench::cpu_timer m_cpu_timer; - nvbench::cpu_timer m_walltime_timer; - nvbench::detail::l2flush m_l2flush; - nvbench::blocking_kernel m_blocker; + nvbench::cuda_timer m_cuda_timer{}; + nvbench::cpu_timer m_cpu_timer{}; + nvbench::cpu_timer m_walltime_timer{}; + nvbench::detail::l2flush m_l2flush{}; + nvbench::blocking_kernel m_blocker{}; - nvbench::criterion_params m_criterion_params; + nvbench::criterion_params m_criterion_params{}; nvbench::stopping_criterion_base &m_stopping_criterion; - nvbench::detail::gpu_frequency m_gpu_frequency; + nvbench::detail::gpu_frequency m_gpu_frequency{}; bool m_disable_blocking_kernel{false}; bool m_run_once{false}; - bool m_check_throttling; + bool m_check_throttling{true}; nvbench::int64_t m_min_samples{}; nvbench::float64_t m_skip_time{}; nvbench::float64_t m_timeout{}; - nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate] - nvbench::float32_t m_throttle_recovery_delay; // [seconds] + nvbench::float32_t m_throttle_threshold{}; // [% of default SM clock rate] + nvbench::float32_t m_throttle_recovery_delay{}; // [seconds] // Dynamically increased when repeated throttling occurs // without successfully recording a sample. 
@@ -122,13 +122,13 @@ protected: nvbench::float64_t m_max_cpu_time{}; nvbench::float64_t m_total_cpu_time{}; - std::vector m_sm_clock_rates{}; nvbench::float64_t m_sm_clock_rate_accumulator{}; + std::vector m_sm_clock_rates{}; - std::vector m_cuda_times; - std::vector m_cpu_times; + std::vector m_cuda_times{}; + std::vector m_cpu_times{}; - bool m_max_time_exceeded{}; + bool m_max_time_exceeded{false}; }; struct measure_cold_base::kernel_launch_timer From 998ab125ce8dcc6333eebe30d1c812ebdc6b9507 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 20 Feb 2026 16:34:53 -0600 Subject: [PATCH 6/7] Don't override m_check_throttling if throttling threshold is non-positive measure_cold class now directly inherits m_check_throttling from state. This ensures that when `--jsonbin` is specified frequency data corresponding to timing data are available to write out. --- nvbench/detail/measure_cold.cu | 2 +- nvbench/detail/measure_cold.cuh | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index e820110..fecff8b 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -44,7 +44,7 @@ measure_cold_base::measure_cold_base(state &exec_state) exec_state.get_stopping_criterion())} , m_disable_blocking_kernel{exec_state.get_disable_blocking_kernel()} , m_run_once{exec_state.get_run_once()} - , m_check_throttling(!exec_state.get_run_once() && exec_state.get_throttle_threshold() > 0.f) + , m_check_throttling(!exec_state.get_run_once()) , m_min_samples{exec_state.get_min_samples()} , m_skip_time{exec_state.get_skip_time()} , m_timeout{exec_state.get_timeout()} diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index 3a372d1..edd24fe 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -140,6 +140,13 @@ struct measure_cold_base::kernel_launch_timer , 
m_check_throttling{measure.m_check_throttling} {} + explicit kernel_launch_timer(measure_cold_base &measure, bool disable_blocking_kernel) + : m_measure{measure} + , m_disable_blocking_kernel{disable_blocking_kernel} + , m_run_once{measure.m_run_once} + , m_check_throttling{measure.m_check_throttling} + {} + explicit kernel_launch_timer(measure_cold_base &measure, bool disable_blocking_kernel, bool run_once, @@ -234,7 +241,7 @@ private: // disable use of blocking kernel for warm-up run // see https://github.com/NVIDIA/nvbench/issues/240 constexpr bool disable_blocking_kernel = true; - kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once, m_check_throttling); + kernel_launch_timer timer(*this, disable_blocking_kernel); this->launch_kernel(timer); this->check_skip_time(m_cuda_timer.get_duration()); @@ -245,7 +252,7 @@ private: // do not use blocking kernel if benchmark is only run once, e.g., when profiling // ref: https://github.com/NVIDIA/nvbench/issue/242 const bool disable_blocking_kernel = m_run_once || m_disable_blocking_kernel; - kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once, m_check_throttling); + kernel_launch_timer timer(*this, disable_blocking_kernel); do { this->launch_kernel(timer); From c9705de4a4b4f4f50aef59b3e1a466da6579a0f6 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 27 Feb 2026 12:49:35 -0600 Subject: [PATCH 7/7] Reserve enough space clock-rates for min samples, if specified --- nvbench/detail/measure_cold.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index fecff8b..b6a4c34 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -53,8 +53,10 @@ measure_cold_base::measure_cold_base(state &exec_state) { if (m_min_samples > 0) { - m_cuda_times.reserve(static_cast(m_min_samples)); - 
m_cpu_times.reserve(static_cast(m_min_samples)); + const auto reserve_size = static_cast(m_min_samples); + m_sm_clock_rates.reserve(reserve_size); + m_cuda_times.reserve(reserve_size); + m_cpu_times.reserve(reserve_size); } }