From a6df59a9b5fcdd2e19e61b3ab3e2ba5f436f221d Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Fri, 4 Apr 2025 20:03:49 +0000 Subject: [PATCH] Add support for CPU-only benchmarking. Fixes #95. CPU-only mode is enabled by setting the `is_cpu_only` property while defining a benchmark, e.g. `NVBENCH_BENCH(foo).set_is_cpu_only(true)`. An optional `nvbench::exec_tag::no_gpu` hint can also be passed to `state.exec` to avoid instantiating GPU benchmarking backends. Note that a CUDA compiler and CUDA runtime are always required, even if all benchmarks in a translation unit are CPU-only. Similarly, a new `nvbench::exec_tag::gpu` hint can be used to avoid compiling CPU-only backends for GPU benchmarks. --- README.md | 9 +- docs/benchmarks.md | 105 ++++++++++-- examples/CMakeLists.txt | 1 + examples/cpu_only.cu | 83 ++++++++++ nvbench/CMakeLists.txt | 1 + nvbench/benchmark_base.cuh | 11 ++ nvbench/benchmark_base.cxx | 10 +- nvbench/benchmark_manager.cxx | 5 +- nvbench/detail/measure_cpu_only.cuh | 141 ++++++++++++++++ nvbench/detail/measure_cpu_only.cxx | 246 ++++++++++++++++++++++++++++ nvbench/detail/state_exec.cuh | 192 +++++++++++++++------- nvbench/exec_tag.cuh | 44 ++++- nvbench/markdown_printer.cu | 7 +- nvbench/option_parser.cu | 5 +- nvbench/state.cuh | 7 + nvbench/state.cxx | 2 + 16 files changed, 780 insertions(+), 89 deletions(-) create mode 100644 examples/cpu_only.cu create mode 100644 nvbench/detail/measure_cpu_only.cuh create mode 100644 nvbench/detail/measure_cpu_only.cxx diff --git a/README.md b/README.md index cbb29d1..4f975d9 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,9 @@ features: * Batch Measurements: * Executes the benchmark multiple times back-to-back and records total time. * Reports the average execution time (total time / number of executions). + * [CPU-only Measurements](docs/benchmarks.md#cpu-only-benchmarks) + * Measures the host-side execution time of a non-GPU benchmark. + * Not suitable for microbenchmarking. 
# Supported Compilers and Tools @@ -65,6 +68,7 @@ This repository provides a number of [examples](examples/) that demonstrate various NVBench features and usecases: - [Runtime and compile-time parameter sweeps](examples/axes.cu) +- [CPU-only benchmarking](examples/cpu_only.cu) - [Enums and compile-time-constant-integral parameter axes](examples/enums.cu) - [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu) - [Skipping benchmark configurations](examples/skip.cu) @@ -171,6 +175,7 @@ testing and parameter tuning of individual kernels. For in-depth analysis of end-to-end performance of multiple applications, the NVIDIA Nsight tools are more appropriate. -NVBench is focused on evaluating the performance of CUDA kernels and is not -optimized for CPU microbenchmarks. This may change in the future, but for now, +NVBench is focused on evaluating the performance of CUDA kernels. It also provides +CPU-only benchmarking facilities intended for non-trivial CPU workloads, but is +not optimized for CPU microbenchmarks. This may change in the future, but for now, consider using Google Benchmark for high resolution CPU benchmarks. 
diff --git a/docs/benchmarks.md b/docs/benchmarks.md index dfd7b07..6ec24e5 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -4,7 +4,7 @@ A basic kernel benchmark can be created with just a few lines of CUDA C++: ```cpp void my_benchmark(nvbench::state& state) { - state.exec([](nvbench::launch& launch) { + state.exec([](nvbench::launch& launch) { my_kernel<<>>(); }); } @@ -97,7 +97,7 @@ void benchmark(nvbench::state& state) const auto num_inputs = state.get_int64("NumInputs"); thrust::device_vector data = generate_input(num_inputs); - state.exec([&data](nvbench::launch& launch) { + state.exec([&data](nvbench::launch& launch) { my_kernel<<>>(data.begin(), data.end()); }); } @@ -134,7 +134,7 @@ void benchmark(nvbench::state& state) const auto quality = state.get_float64("Quality"); state.exec([&quality](nvbench::launch& launch) - { + { my_kernel<<>>(quality); }); } @@ -153,7 +153,7 @@ void benchmark(nvbench::state& state) thrust::device_vector data = generate_input(rng_dist); state.exec([&data](nvbench::launch& launch) - { + { my_kernel<<>>(data.begin(), data.end()); }); } @@ -182,7 +182,7 @@ void my_benchmark(nvbench::state& state, nvbench::type_list) thrust::device_vector data = generate_input(); state.exec([&data](nvbench::launch& launch) - { + { my_kernel<<>>(data.begin(), data.end()); }); } @@ -266,7 +266,6 @@ In general:: More examples can found in [examples/throughput.cu](../examples/throughput.cu). - # Skip Uninteresting / Invalid Benchmarks Sometimes particular combinations of parameters aren't useful or interesting — @@ -294,7 +293,7 @@ void my_benchmark(nvbench::state& state, nvbench::type_list) // Skip benchmarks at compile time -- for example, always skip when T == U // (Note that the `type_list` argument defines the same type twice). 
template -void my_benchmark(nvbench::state& state, +void my_benchmark(nvbench::state& state, nvbench::type_list) { state.skip("T must not be the same type as U."); @@ -320,6 +319,15 @@ true: synchronize internally. - `nvbench::exec_tag::timer` requests a timer object that can be used to restrict the timed region. +- `nvbench::exec_tag::no_batch` disables batch measurements. This both disables + them during execution to reduce runtime, and prevents their compilation to + reduce compile-time and binary size. +- `nvbench::exec_tag::gpu` is an optional hint that prevents non-GPU benchmarking + code from being compiled for a particular benchmark. A runtime error is emitted + if the benchmark is defined with `set_is_cpu_only(true)`. +- `nvbench::exec_tag::no_gpu` is an optional hint that prevents GPU benchmarking + code from being compiled for a particular benchmark. A runtime error is emitted + if the benchmark does not also define `set_is_cpu_only(true)`. Multiple execution tags may be combined using `operator|`, e.g. @@ -370,7 +378,7 @@ Note that using manual timer mode disables batch measurements. void timer_example(nvbench::state& state) { // Pass the `timer` exec tag to request a timer: - state.exec(nvbench::exec_tag::timer, + state.exec(nvbench::exec_tag::timer, // Lambda now accepts a timer: [](nvbench::launch& launch, auto& timer) { @@ -391,6 +399,79 @@ NVBENCH_BENCH(timer_example); See [examples/exec_tag_timer.cu](../examples/exec_tag_timer.cu) for a complete example. +## Compilation hints: `nvbench::exec_tag::no_batch`, `gpu`, and `no_gpu` + +These execution tags are optional hints that disable the compilation of various +code paths when they are not needed. They apply only to a single benchmark. + +- `nvbench::exec_tag::no_batch` prevents the execution and instantiation of the batch measurement backend. +- `nvbench::exec_tag::gpu` prevents the instantiation of CPU-only benchmarking backends. 
+ - Requires that the benchmark does not define `set_is_cpu_only(true)`. + - Optional; this has no effect on runtime measurements, but reduces compile-time and binary size. + - Host-side CPU measurements of GPU kernel execution time are still provided. +- `nvbench::exec_tag::no_gpu` prevents the instantiation of GPU benchmarking backends. + - Requires that the benchmark defines `set_is_cpu_only(true)`. + - Optional; this has no effect on runtime measurements, but reduces compile-time and binary size. + - See also [CPU-only Benchmarks](#cpu-only-benchmarks). + +# CPU-only Benchmarks + +NVBench provides CPU-only benchmarking facilities that are intended for measuring +significant CPU workloads. We do not recommend using these features for high-resolution +CPU benchmarking -- other libraries (such as Google Benchmark) are more appropriate for +such applications. Examples are provided in [examples/cpu_only.cu](../examples/cpu_only.cu). + +Note that NVBench still requires a CUDA compiler and runtime even if a project only contains +CPU-only benchmarks. + +The `is_cpu_only` property of the benchmark toggles between GPU and CPU-only measurements: + +```cpp +void my_cpu_benchmark(nvbench::state &state) +{ + state.exec([](nvbench::launch &) { /* workload */ }); +} +NVBENCH_BENCH(my_cpu_benchmark) + .set_is_cpu_only(true); // Mark as CPU-only. +``` + +The optional `nvbench::exec_tag::no_gpu` hint may be used to reduce the compilation time and +binary size of CPU-only benchmarks. An error is emitted at runtime if this tag is used while +`is_cpu_only` is false. + +```cpp +void my_cpu_benchmark(nvbench::state &state) +{ + state.exec(nvbench::exec_tag::no_gpu, // Prevent compilation of GPU backends + [](nvbench::launch &) { /* workload */ }); +} +NVBENCH_BENCH(my_cpu_benchmark) + .set_is_cpu_only(true); // Mark as CPU-only. +``` + +The `nvbench::exec_tag::timer` execution tag is also supported by CPU-only benchmarks.
This +is useful for benchmarks that require additional per-sample setup/teardown. See the +[`nvbench::exec_tag::timer`](#explicit-timer-mode-nvbenchexec_tagtimer) section for more +details. + +```cpp +void my_cpu_benchmark(nvbench::state &state) +{ + state.exec(nvbench::exec_tag::no_gpu | // Prevent compilation of GPU backends + nvbench::exec_tag::timer, // Request a timer object + [](nvbench::launch &, auto &timer) + { + // Setup here + timer.start(); + // timed workload + timer.stop(); + // teardown here + }); +} +NVBENCH_BENCH(my_cpu_benchmark) + .set_is_cpu_only(true); // Mark as CPU-only. +``` + # Beware: Combinatorial Explosion Is Lurking Be very careful of how quickly the configuration space can grow. The following @@ -403,7 +484,7 @@ using value_types = nvbench::type_list; -using op_types = nvbench::type_list, +using op_types = nvbench::type_list, thrust::multiplies<>, thrust::maximum<>>; @@ -418,7 +499,7 @@ NVBENCH_BENCH_TYPES(my_benchmark, ``` 960 total configs -= 4 [T=(U8, I32, F32, F64)] += 4 [T=(U8, I32, F32, F64)] * 4 [U=(U8, I32, F32, F64)] * 4 [V=(U8, I32, F32, F64)] * 3 [Op=(plus, multiplies, max)] @@ -427,8 +508,8 @@ NVBENCH_BENCH_TYPES(my_benchmark, For large configuration spaces like this, pruning some of the less useful combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques -described in the "Skip Uninteresting / Invalid Benchmarks" section can help -immensely with keeping compile / run times manageable. +described in the [Skip Uninteresting / Invalid Benchmarks](#skip-uninteresting--invalid-benchmarks) +section can help immensely with keeping compile / run times manageable. Splitting a single large configuration space into multiple, more focused benchmarks with reduced dimensionality will likely be worth the effort as well. 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index dc6e52b..b3e635c 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -2,6 +2,7 @@ set(example_srcs auto_throughput.cu axes.cu custom_criterion.cu + cpu_only.cu enums.cu exec_tag_sync.cu exec_tag_timer.cu diff --git a/examples/cpu_only.cu b/examples/cpu_only.cu new file mode 100644 index 0000000..a7aa7a1 --- /dev/null +++ b/examples/cpu_only.cu @@ -0,0 +1,83 @@ +/* + * Copyright 2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +// Block execution of the current CPU thread for `seconds` seconds. +void sleep_host(double seconds) +{ + std::this_thread::sleep_for( + std::chrono::milliseconds(static_cast(seconds * 1000))); +} + +//============================================================================= +// Simple CPU-only benchmark that sleeps on host for a specified duration. +void simple(nvbench::state &state) +{ + const auto duration = state.get_float64("Duration"); + + state.exec([duration](nvbench::launch &) { sleep_host(duration); }); +} +NVBENCH_BENCH(simple) + // 100 -> 500 ms in 100 ms increments. + .add_float64_axis("Duration", nvbench::range(.1, .5, .1)) + // Mark as CPU-only. 
+ .set_is_cpu_only(true); + +//============================================================================= +// Simple CPU-only benchmark that sleeps on host for a specified duration and +// uses a custom timed region. +void simple_timer(nvbench::state &state) +{ + const auto duration = state.get_float64("Duration"); + + state.exec(nvbench::exec_tag::timer, [duration](nvbench::launch &, auto &timer) { + // Do any setup work before starting the timer here... + timer.start(); + + // The region of code to be timed: + sleep_host(duration); + + timer.stop(); + // Any per-run cleanup here... + }); +} +NVBENCH_BENCH(simple_timer) + // 100 -> 500 ms in 100 ms increments. + .add_float64_axis("Duration", nvbench::range(.1, .5, .1)) + // Mark as CPU-only. + .set_is_cpu_only(true); + +//============================================================================= +// Simple CPU-only benchmark that uses the optional `nvbench::exec_tag::no_gpu` +// hint to prevent GPU measurement code from being instantiated. Note that +// `set_is_cpu_only(true)` is still required when using this hint. +void simple_no_gpu(nvbench::state &state) +{ + const auto duration = state.get_float64("Duration"); + + state.exec(nvbench::exec_tag::no_gpu, [duration](nvbench::launch &) { sleep_host(duration); }); +} +NVBENCH_BENCH(simple_no_gpu) + // 100 -> 500 ms in 100 ms increments. + .add_float64_axis("Duration", nvbench::range(.1, .5, .1)) + // Mark as CPU-only. 
+ .set_is_cpu_only(true); diff --git a/nvbench/CMakeLists.txt b/nvbench/CMakeLists.txt index 265fa99..33569b9 100644 --- a/nvbench/CMakeLists.txt +++ b/nvbench/CMakeLists.txt @@ -25,6 +25,7 @@ set(srcs detail/entropy_criterion.cxx detail/measure_cold.cu + detail/measure_cpu_only.cxx detail/measure_hot.cu detail/state_generator.cxx detail/stdrel_criterion.cxx diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index a695b14..a363a3b 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -159,6 +159,16 @@ struct benchmark_base } /// @} + /// If true, the benchmark measurements only record CPU time and assume no GPU work is performed. + /// @{ + [[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; } + benchmark_base &set_is_cpu_only(bool is_cpu_only) + { + m_is_cpu_only = is_cpu_only; + return *this; + } + /// @} + /// If true, the benchmark is only run once, skipping all warmup runs and only /// executing a single non-batched measurement. This is intended for use with /// external profiling tools. 
@{ @@ -263,6 +273,7 @@ protected: optional_ref m_printer; + bool m_is_cpu_only{false}; bool m_run_once{false}; bool m_disable_blocking_kernel{false}; diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx index 6e89fd3..3cdea6b 100644 --- a/nvbench/benchmark_base.cxx +++ b/nvbench/benchmark_base.cxx @@ -34,12 +34,18 @@ std::unique_ptr benchmark_base::clone() const result->m_axes = m_axes; result->m_devices = m_devices; - result->m_min_samples = m_min_samples; - result->m_criterion_params = m_criterion_params; + result->m_printer = m_printer; + + result->m_is_cpu_only = m_is_cpu_only; + result->m_run_once = m_run_once; + result->m_disable_blocking_kernel = m_disable_blocking_kernel; + + result->m_min_samples = m_min_samples; result->m_skip_time = m_skip_time; result->m_timeout = m_timeout; + result->m_criterion_params = m_criterion_params; result->m_stopping_criterion = m_stopping_criterion; return result; diff --git a/nvbench/benchmark_manager.cxx b/nvbench/benchmark_manager.cxx index 5df702d..4ff9fe4 100644 --- a/nvbench/benchmark_manager.cxx +++ b/nvbench/benchmark_manager.cxx @@ -40,7 +40,10 @@ void benchmark_manager::initialize() const auto& mgr = device_manager::get(); for (auto& bench : m_benchmarks) { - bench->set_devices(mgr.get_devices()); + if (!bench->get_is_cpu_only()) + { + bench->set_devices(mgr.get_devices()); + } } } diff --git a/nvbench/detail/measure_cpu_only.cuh b/nvbench/detail/measure_cpu_only.cuh new file mode 100644 index 0000000..803da70 --- /dev/null +++ b/nvbench/detail/measure_cpu_only.cuh @@ -0,0 +1,141 @@ +/* + * Copyright 2021-2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. 
+ * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace nvbench +{ + +struct state; + +namespace detail +{ + +// non-templated code goes here: +struct measure_cpu_only_base +{ + explicit measure_cpu_only_base(nvbench::state &exec_state); + measure_cpu_only_base(const measure_cpu_only_base &) = delete; + measure_cpu_only_base(measure_cpu_only_base &&) = delete; + measure_cpu_only_base &operator=(const measure_cpu_only_base &) = delete; + measure_cpu_only_base &operator=(measure_cpu_only_base &&) = delete; + +protected: + void check(); + void initialize(); + void run_trials_prologue(); + void record_measurements(); + bool is_finished(); + void run_trials_epilogue(); + void generate_summaries(); + + void check_skip_time(nvbench::float64_t warmup_time); + + nvbench::state &m_state; + + // Required to satisfy the KernelLauncher interface: + nvbench::launch m_launch; + + nvbench::cpu_timer m_cpu_timer; + nvbench::cpu_timer m_walltime_timer; + + nvbench::criterion_params m_criterion_params; + nvbench::stopping_criterion_base& m_stopping_criterion; + + bool m_run_once{false}; + + nvbench::int64_t m_min_samples{}; + + nvbench::float64_t m_skip_time{}; + nvbench::float64_t m_timeout{}; + + nvbench::int64_t m_total_samples{}; + nvbench::float64_t m_total_cpu_time{}; + nvbench::float64_t m_cpu_noise{}; // rel stdev + + std::vector m_cpu_times; + + bool m_max_time_exceeded{}; +}; + +template +struct measure_cpu_only : public measure_cpu_only_base +{ + measure_cpu_only(nvbench::state 
&state, KernelLauncher &kernel_launcher) + : measure_cpu_only_base(state) + , m_kernel_launcher{kernel_launcher} + {} + + void operator()() + { + this->check(); + this->initialize(); + this->run_warmup(); + + this->run_trials_prologue(); + this->run_trials(); + this->run_trials_epilogue(); + + this->generate_summaries(); + } + +private: + // Run the kernel once, measuring the CPU time. If under skip_time, skip the + // measurement. + void run_warmup() + { + if (m_run_once) + { // Skip warmups + return; + } + + this->launch_kernel(m_cpu_timer); + this->check_skip_time(m_cpu_timer.get_duration()); + } + + void run_trials() + { + do + { + this->launch_kernel(m_cpu_timer); + this->record_measurements(); + } while (!this->is_finished()); + } + + template + __forceinline__ void launch_kernel(TimerT &timer) + { + m_kernel_launcher(m_launch, timer); + } + + KernelLauncher &m_kernel_launcher; +}; + +} // namespace detail +} // namespace nvbench diff --git a/nvbench/detail/measure_cpu_only.cxx b/nvbench/detail/measure_cpu_only.cxx new file mode 100644 index 0000000..e4da01d --- /dev/null +++ b/nvbench/detail/measure_cpu_only.cxx @@ -0,0 +1,246 @@ +/* + * Copyright 2021-2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace nvbench::detail +{ + +measure_cpu_only_base::measure_cpu_only_base(state &exec_state) + : m_state{exec_state} + , m_launch(m_state.get_cuda_stream()) + , m_criterion_params{exec_state.get_criterion_params()} + , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(exec_state.get_stopping_criterion())} + , m_run_once{exec_state.get_run_once()} + , m_min_samples{exec_state.get_min_samples()} + , m_skip_time{exec_state.get_skip_time()} + , m_timeout{exec_state.get_timeout()} +{ + if (m_min_samples > 0) + { + m_cpu_times.reserve(static_cast(m_min_samples)); + } +} + +void measure_cpu_only_base::check() +{ + // no-op +} + +void measure_cpu_only_base::initialize() +{ + m_total_cpu_time = 0.; + m_cpu_noise = 0.; + m_total_samples = 0; + m_cpu_times.clear(); + m_max_time_exceeded = false; + + m_stopping_criterion.initialize(m_criterion_params); +} + +void measure_cpu_only_base::run_trials_prologue() { m_walltime_timer.start(); } + +void measure_cpu_only_base::record_measurements() +{ + // Update and record timers and counters: + const auto cur_cpu_time = m_cpu_timer.get_duration(); + m_cpu_times.push_back(cur_cpu_time); + m_total_cpu_time += cur_cpu_time; + ++m_total_samples; + + m_stopping_criterion.add_measurement(cur_cpu_time); +} + +bool measure_cpu_only_base::is_finished() +{ + if (m_run_once) + { + return true; + } + + // Check that we've gathered enough samples: + if (m_total_samples > m_min_samples) + { + if (m_stopping_criterion.is_finished()) + { + return true; + } + } + + // Check for timeouts: + m_walltime_timer.stop(); + if (m_walltime_timer.get_duration() > m_timeout) + { + m_max_time_exceeded = true; + return true; + } + + return false; +} + +void measure_cpu_only_base::run_trials_epilogue() +{ + // Only need to compute this at the end, not per iteration. 
+ const auto cpu_mean = m_total_cpu_time / static_cast(m_total_samples); + const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(), + m_cpu_times.cend(), + cpu_mean); + m_cpu_noise = cpu_stdev / cpu_mean; + + m_walltime_timer.stop(); +} + +void measure_cpu_only_base::generate_summaries() +{ + { + auto &summ = m_state.add_summary("nv/cpu_only/sample_size"); + summ.set_string("name", "Samples"); + summ.set_string("hint", "sample_size"); + summ.set_string("description", "Number of isolated kernel executions"); + summ.set_int64("value", m_total_samples); + } + + const auto d_samples = static_cast(m_total_samples); + const auto avg_cpu_time = m_total_cpu_time / d_samples; + { + auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/mean"); + summ.set_string("name", "CPU Time"); + summ.set_string("hint", "duration"); + summ.set_string("description", + "Mean isolated kernel execution time " + "(measured on host CPU)"); + summ.set_float64("value", avg_cpu_time); + } + + { + auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/stdev/relative"); + summ.set_string("name", "Noise"); + summ.set_string("hint", "percentage"); + summ.set_string("description", "Relative standard deviation of isolated CPU times"); + summ.set_float64("value", m_cpu_noise); + } + + if (const auto items = m_state.get_element_count(); items != 0) + { + auto &summ = m_state.add_summary("nv/cpu_only/bw/item_rate"); + summ.set_string("name", "Elem/s"); + summ.set_string("hint", "item_rate"); + summ.set_string("description", "Number of input elements processed per second"); + summ.set_float64("value", static_cast(items) / avg_cpu_time); + } + + if (const auto bytes = m_state.get_global_memory_rw_bytes(); bytes != 0) + { + const auto avg_used_gmem_bw = static_cast(bytes) / avg_cpu_time; + { + auto &summ = m_state.add_summary("nv/cpu_only/bw/global/bytes_per_second"); + summ.set_string("name", "GlobalMem BW"); + summ.set_string("hint", "byte_rate"); + 
summ.set_string("description", + "Number of bytes read/written per second."); + summ.set_float64("value", avg_used_gmem_bw); + } + } // bandwidth + + { + auto &summ = m_state.add_summary("nv/cpu_only/walltime"); + summ.set_string("name", "Walltime"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Walltime used for isolated measurements"); + summ.set_float64("value", m_walltime_timer.get_duration()); + summ.set_string("hide", "Hidden by default."); + } + + // Log if a printer exists: + if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value()) + { + auto &printer = printer_opt_ref.value().get(); + + if (m_max_time_exceeded) + { + const auto timeout = m_walltime_timer.get_duration(); + const auto max_noise = m_criterion_params.get_float64("max-noise"); + const auto min_time = m_criterion_params.get_float64("min-time"); + + if (m_cpu_noise > max_noise) + { + printer.log(nvbench::log_level::warn, + fmt::format("Current measurement timed out ({:0.2f}s) " + "while over noise threshold ({:0.2f}% > " + "{:0.2f}%)", + timeout, + m_cpu_noise * 100, + max_noise * 100)); + } + if (m_total_samples < m_min_samples) + { + printer.log(nvbench::log_level::warn, + fmt::format("Current measurement timed out ({:0.2f}s) " + "before accumulating min_samples ({} < {})", + timeout, + m_total_samples, + m_min_samples)); + } + if (m_total_cpu_time < min_time) + { + printer.log(nvbench::log_level::warn, + fmt::format("Current measurement timed out ({:0.2f}s) " + "before accumulating min_time ({:0.2f}s < " + "{:0.2f}s)", + timeout, + m_total_cpu_time, + min_time)); + } + } + + // Log to stdout: + printer.log(nvbench::log_level::pass, + fmt::format("CpuOnly: {:0.6f}ms mean CPU, {:0.2f}s total CPU, " + "{:0.2f}s total wall, {}x ", + avg_cpu_time * 1e3, + m_total_cpu_time, + m_walltime_timer.get_duration(), + m_total_samples)); + + printer.process_bulk_data(m_state, "nv/cpu_only/sample_times", "sample_times", m_cpu_times); + } +} + 
+void measure_cpu_only_base::check_skip_time(nvbench::float64_t warmup_time) +{ + if (m_skip_time > 0. && warmup_time < m_skip_time) + { + auto reason = fmt::format("Warmup time did not meet skip_time limit: " + "{:0.3f}us < {:0.3f}us.", + warmup_time * 1e6, + m_skip_time * 1e6); + + m_state.skip(reason); + NVBENCH_THROW(std::runtime_error, "{}", std::move(reason)); + } +} + +} // namespace nvbench::detail diff --git a/nvbench/detail/state_exec.cuh b/nvbench/detail/state_exec.cuh index b87f487..f36d2a1 100644 --- a/nvbench/detail/state_exec.cuh +++ b/nvbench/detail/state_exec.cuh @@ -24,15 +24,16 @@ #endif // NVBENCH_STATE_EXEC_GUARD #include +#include +#include +#include +#include #include #include -#include #ifdef NVBENCH_HAS_CUPTI #include #endif // NVBENCH_HAS_CUPTI -#include -#include #include @@ -45,21 +46,44 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher) using KL = typename std::remove_reference::type; using namespace nvbench::exec_tag::impl; static_assert(is_exec_tag_v, - "`ExecTags` argument must be a member (or combination of " - "members) from nvbench::exec_tag."); + "`ExecTags` argument must be a member (or combination of members) from " + "`nvbench::exec_tag`."); + static_assert(!((tags & gpu) && (tags & no_gpu)), + "`nvbench::exec_tag::gpu` and `nvbench::exec_tag::no_gpu` are mutually " + "exclusive."); - constexpr auto measure_tags = tags & measure_mask; constexpr auto modifier_tags = tags & modifier_mask; + constexpr auto measure_tags = tags & measure_mask; - // "run once" is handled by the cold measurement: + if ((modifier_tags & no_gpu) && !this->get_is_cpu_only()) + { + throw std::runtime_error("The `nvbench::exec_tag::no_gpu` tag requires that " + "`set_is_cpu_only(true)` is called when defining the benchmark."); + } + + if ((modifier_tags & gpu) && this->get_is_cpu_only()) + { + throw std::runtime_error("The `nvbench::exec_tag::gpu` tag requires that " + "`set_is_cpu_only(true)` is NOT called when defining the 
benchmark."); + } + + // "run once" should disable batch measurements: + // TODO This should just be a runtime branch in measure_cold. Currently this causes two versions + // of measure_cold to be compiled. We don't expose the `run_once` tag to users, it should be + // removed. + // TODO CPU measurements should support run_once as well. if (!(modifier_tags & run_once) && this->get_run_once()) { - constexpr auto run_once_tags = modifier_tags | cold | run_once; + constexpr auto run_once_tags = modifier_tags | run_once | (measure_tags & ~hot); this->exec(run_once_tags, std::forward(kernel_launcher)); return; } - if (!(modifier_tags & no_block) && this->get_disable_blocking_kernel()) + // TODO The `no_block` tag should be removed and replaced with a runtime branch in measure_cold + // and measure_hot. Currently this causes unnecessary codegen. Note that the `sync` exec_tag + // implies `no_block` when refactoring. + if (!(measure_tags & cpu_only) && !(modifier_tags & no_block) && + this->get_disable_blocking_kernel()) { constexpr auto no_block_tags = tags | no_block; this->exec(no_block_tags, std::forward(kernel_launcher)); @@ -69,14 +93,32 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher) // If no measurements selected, pick some defaults based on the modifiers: if constexpr (!measure_tags) { - if constexpr (modifier_tags & (timer | sync)) - { // Can't do hot timings with manual timer or sync; whole point is to not - // sync in between executions.
- this->exec(cold | tags, std::forward(kernel_launcher)); - } - else + if constexpr (modifier_tags & gpu) { - this->exec(cold | hot | tags, std::forward(kernel_launcher)); + if constexpr (modifier_tags & no_batch) + { + this->exec(cold | modifier_tags, std::forward(kernel_launcher)); + } + else + { + this->exec(cold | hot | modifier_tags, std::forward(kernel_launcher)); + } + } + else if constexpr (modifier_tags & no_gpu) + { + this->exec(cpu_only | modifier_tags, std::forward(kernel_launcher)); + } + else // Instantiate both CPU and GPU measurement code: + { + if constexpr (modifier_tags & no_batch) + { + this->exec(cold | cpu_only | modifier_tags, std::forward(kernel_launcher)); + } + else + { + this->exec(cold | hot | cpu_only | modifier_tags, + std::forward(kernel_launcher)); + } } return; } @@ -86,62 +128,90 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher) return; } - // Each measurement is deliberately isolated in constexpr branches to - // avoid instantiating unused measurements. 
- if constexpr (tags & cold) + if (this->get_is_cpu_only()) { - constexpr bool use_blocking_kernel = !(tags & no_block); - if constexpr (tags & timer) + if constexpr (tags & cpu_only) // Prevent instantiation when not needed { + static_assert(!(tags & gpu), "CPU-only measurement doesn't support the `gpu` exec_tag."); + + if constexpr (tags & timer) + { + using measure_t = nvbench::detail::measure_cpu_only; + measure_t measure{*this, kernel_launcher}; + measure(); + } + else + { // Need to wrap the kernel launcher with a timer wrapper: + using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper; + wrapper_t wrapper{kernel_launcher}; + + using measure_t = nvbench::detail::measure_cpu_only; + measure_t measure(*this, wrapper); + measure(); + } + } + } + else + { + if constexpr (tags & cold) // Prevent instantiation when not needed + { + static_assert(!(tags & no_gpu), "Cold measurement doesn't support the `no_gpu` exec_tag."); + + constexpr bool use_blocking_kernel = !(tags & no_block); + if constexpr (tags & timer) + { // Estimate bandwidth here #ifdef NVBENCH_HAS_CUPTI - if constexpr (!(modifier_tags & run_once)) - { - if (this->is_cupti_required()) + if constexpr (!(modifier_tags & run_once)) { - using measure_t = nvbench::detail::measure_cupti; - measure_t measure{*this, kernel_launcher}; - measure(); + if (this->is_cupti_required()) + { + using measure_t = nvbench::detail::measure_cupti; + measure_t measure{*this, kernel_launcher}; + measure(); + } } - } #endif - using measure_t = nvbench::detail::measure_cold; + using measure_t = nvbench::detail::measure_cold; + measure_t measure{*this, kernel_launcher}; + measure(); + } + else + { // Need to wrap the kernel launcher with a timer wrapper: + using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper; + wrapper_t wrapper{kernel_launcher}; + +// Estimate bandwidth here +#ifdef NVBENCH_HAS_CUPTI + if constexpr (!(modifier_tags & run_once)) + { + if (this->is_cupti_required()) + { + using measure_t = 
nvbench::detail::measure_cupti; + measure_t measure{*this, wrapper}; + measure(); + } + } +#endif + + using measure_t = nvbench::detail::measure_cold; + measure_t measure(*this, wrapper); + measure(); + } + } + + if constexpr (tags & hot) // Prevent instantiation when not needed + { + static_assert(!(tags & sync), "Hot measurement doesn't support the `sync` exec_tag."); + static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag."); + static_assert(!(tags & no_batch), "Hot measurement doesn't support the `no_batch` exec_tag."); + static_assert(!(tags & no_gpu), "Hot measurement doesn't support the `no_gpu` exec_tag."); + constexpr bool use_blocking_kernel = !(tags & no_block); + using measure_t = nvbench::detail::measure_hot; measure_t measure{*this, kernel_launcher}; measure(); } - else - { // Need to wrap the kernel launcher with a timer wrapper: - using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper; - wrapper_t wrapper{kernel_launcher}; - -// Estimate bandwidth here -#ifdef NVBENCH_HAS_CUPTI - if constexpr (!(modifier_tags & run_once)) - { - if (this->is_cupti_required()) - { - using measure_t = nvbench::detail::measure_cupti; - measure_t measure{*this, wrapper}; - measure(); - } - } -#endif - - using measure_t = nvbench::detail::measure_cold; - measure_t measure(*this, wrapper); - measure(); - } - } - - if constexpr (tags & hot) - { - static_assert(!(tags & sync), "Hot measurement doesn't support the `sync` exec_tag."); - static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag."); - constexpr bool use_blocking_kernel = !(tags & no_block); - using measure_t = nvbench::detail::measure_hot; - measure_t measure{*this, kernel_launcher}; - measure(); } } diff --git a/nvbench/exec_tag.cuh b/nvbench/exec_tag.cuh index c935e4c..64dbf2b 100644 --- a/nvbench/exec_tag.cuh +++ b/nvbench/exec_tag.cuh @@ -35,12 +35,17 @@ enum class exec_flag no_block = 0x02, // Disables use of `blocking_kernel`. 
sync = 0x04, // KernelLauncher has indicated that it will sync run_once = 0x08, // Only run the benchmark once (for profiling). - modifier_mask = timer | no_block | sync | run_once, + gpu = 0x10, // Don't instantiate `measure_cpu_only`. + no_gpu = 0x20, // No GPU measurements should be instantiated. + no_batch = 0x40, // `measure_hot` will not be used. + modifier_mask = 0xFF, - // Measurement types: + // Measurement types to instantiate. Derived from modifiers. + // Should not be exposed directly via nvbench::exec_tag::<...>. cold = 0x0100, // measure_cold hot = 0x0200, // measure_hot - measure_mask = cold | hot + cpu_only = 0x0400, // measure_cpu_only + measure_mask = 0xFF00, }; } // namespace nvbench::detail @@ -95,9 +100,14 @@ using timer_t = tag; using no_block_t = tag; using sync_t = tag; using run_once_t = tag; +using gpu_t = tag; +using no_gpu_t = tag; +using no_batch_t = tag; +using modifier_mask_t = tag; + using hot_t = tag; using cold_t = tag; -using modifier_mask_t = tag; +using cpu_only_t = tag; using measure_mask_t = tag; constexpr inline none_t none; @@ -105,9 +115,14 @@ constexpr inline timer_t timer; constexpr inline no_block_t no_block; constexpr inline sync_t sync; constexpr inline run_once_t run_once; +constexpr inline gpu_t gpu; +constexpr inline no_gpu_t no_gpu; +constexpr inline no_batch_t no_batch; +constexpr inline modifier_mask_t modifier_mask; + constexpr inline cold_t cold; constexpr inline hot_t hot; -constexpr inline modifier_mask_t modifier_mask; +constexpr inline cpu_only_t cpu_only; constexpr inline measure_mask_t measure_mask; } // namespace impl @@ -116,13 +131,26 @@ constexpr inline auto none = nvbench::exec_tag::impl::none; /// Modifier used when only a portion of the KernelLauncher needs to be timed. /// Useful for resetting state in-between timed kernel launches. 
-constexpr inline auto timer = nvbench::exec_tag::impl::timer; +constexpr inline auto timer = nvbench::exec_tag::impl::timer | // + nvbench::exec_tag::impl::no_batch; /// Modifier used to indicate that the KernelGenerator will perform CUDA /// synchronizations. Without this flag such benchmarks will deadlock. -constexpr inline auto sync = nvbench::exec_tag::impl::no_block | nvbench::exec_tag::impl::sync; +constexpr inline auto sync = nvbench::exec_tag::impl::no_block | // + nvbench::exec_tag::impl::sync | // + nvbench::exec_tag::impl::no_batch; /// Modifier used to indicate that batched measurements should be disabled -constexpr inline auto no_batch = nvbench::exec_tag::impl::cold; +constexpr inline auto no_batch = nvbench::exec_tag::impl::no_batch; + +/// Optional optimization for CPU-only benchmarks. Requires that `set_is_cpu_only(true)` +/// is called when defining the benchmark. Passing this exec_tag will ensure that +/// GPU measurement code is not instantiated. +constexpr inline auto no_gpu = nvbench::exec_tag::impl::no_gpu; + +/// Optional optimization for GPU benchmarks. Requires that `set_is_cpu_only(true)` +/// is NOT called when defining the benchmark. Passing this exec_tag will prevent unused CPU-only +/// measurement code from being instantiated. +constexpr inline auto gpu = nvbench::exec_tag::impl::gpu; } // namespace nvbench::exec_tag diff --git a/nvbench/markdown_printer.cu b/nvbench/markdown_printer.cu index a1e8b1d..263d230 100644 --- a/nvbench/markdown_printer.cu +++ b/nvbench/markdown_printer.cu @@ -227,7 +227,7 @@ void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_ const auto &devices = bench.get_devices(); const auto &axes = bench.get_axes(); - fmt::format_to(std::back_inserter(buffer), "\n## {}\n", bench.get_name()); + fmt::format_to(std::back_inserter(buffer), "\n## {}\n\n", bench.get_name()); // Do a single pass when no devices are specified. This happens for // benchmarks with `cpu` exec_tags. 
@@ -240,7 +240,10 @@ void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_ if (device) { - fmt::format_to(std::back_inserter(buffer), "\n### [{}] {}\n\n", device->get_id(), device->get_name()); + fmt::format_to(std::back_inserter(buffer), + "### [{}] {}\n\n", + device->get_id(), + device->get_name()); } std::size_t row = 0; diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu index 89f4c93..525f275 100644 --- a/nvbench/option_parser.cu +++ b/nvbench/option_parser.cu @@ -799,7 +799,10 @@ try else { benchmark_base &bench = *m_benchmarks.back(); - bench.set_devices(device_vec); + if (!bench.get_is_cpu_only()) + { + bench.set_devices(device_vec); + } } m_recent_devices = std::move(device_vec); diff --git a/nvbench/state.cuh b/nvbench/state.cuh index bc4791e..86e6d16 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -71,6 +71,12 @@ struct state /// nullopt for CPU-only benchmarks. [[nodiscard]] const std::optional &get_device() const { return m_device; } + /// If true, the benchmark measurements only record CPU time and assume no GPU work is performed. + /// @{ + // No setter, this should not be modified after construction, as it is a benchmark-wide property. + [[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; } + /// @} + /// An index into a benchmark::type_configs type_list. Returns 0 if no type /// axes in the associated benchmark. 
[[nodiscard]] std::size_t get_type_config_index() const { return m_type_config_index; } @@ -293,6 +299,7 @@ private: std::optional m_device; std::size_t m_type_config_index{}; + bool m_is_cpu_only{false}; bool m_run_once{false}; bool m_disable_blocking_kernel{false}; diff --git a/nvbench/state.cxx b/nvbench/state.cxx index 1be48c5..8779e9f 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -34,6 +34,7 @@ namespace nvbench state::state(const benchmark_base &bench) : m_benchmark{bench} + , m_is_cpu_only(bench.get_is_cpu_only()) , m_run_once{bench.get_run_once()} , m_disable_blocking_kernel{bench.get_disable_blocking_kernel()} , m_criterion_params{bench.get_criterion_params()} @@ -51,6 +52,7 @@ state::state(const benchmark_base &bench, , m_axis_values{std::move(values)} , m_device{std::move(device)} , m_type_config_index{type_config_index} + , m_is_cpu_only(bench.get_is_cpu_only()) , m_run_once{bench.get_run_once()} , m_disable_blocking_kernel{bench.get_disable_blocking_kernel()} , m_criterion_params{bench.get_criterion_params()}