mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-03-14 20:27:24 +00:00
Add support for CPU-only benchmarking.
Fixes #95. CPU-only mode is enabled by setting the `is_cpu_only` property while defining a benchmark, e.g. `NVBENCH_BENCH(foo).set_is_cpu_only(true)`. An optional `nvbench::exec_tag::no_gpu` hint can also be passed to `state.exec` to avoid instantiating GPU benchmarking backends. Note that a CUDA compiler and CUDA runtime are always required, even if all benchmarks in a translation unit are CPU-only. Similarly, a new `nvbench::exec_tag::gpu` hint can be used to avoid compiling CPU-only backends for GPU benchmarks.
This commit is contained in:
@@ -25,6 +25,9 @@ features:
|
||||
* Batch Measurements:
|
||||
* Executes the benchmark multiple times back-to-back and records total time.
|
||||
* Reports the average execution time (total time / number of executions).
|
||||
* [CPU-only Measurements](docs/benchmarks.md#cpu-only-benchmarks)
|
||||
* Measures the host-side execution time of a non-GPU benchmark.
|
||||
* Not suitable for microbenchmarking.
|
||||
|
||||
# Supported Compilers and Tools
|
||||
|
||||
@@ -65,6 +68,7 @@ This repository provides a number of [examples](examples/) that demonstrate
|
||||
various NVBench features and usecases:
|
||||
|
||||
- [Runtime and compile-time parameter sweeps](examples/axes.cu)
|
||||
- [CPU-only benchmarking](examples/cpu_only.cu)
|
||||
- [Enums and compile-time-constant-integral parameter axes](examples/enums.cu)
|
||||
- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
|
||||
- [Skipping benchmark configurations](examples/skip.cu)
|
||||
@@ -171,6 +175,7 @@ testing and parameter tuning of individual kernels. For in-depth analysis of
|
||||
end-to-end performance of multiple applications, the NVIDIA Nsight tools are
|
||||
more appropriate.
|
||||
|
||||
NVBench is focused on evaluating the performance of CUDA kernels and is not
|
||||
optimized for CPU microbenchmarks. This may change in the future, but for now,
|
||||
NVBench is focused on evaluating the performance of CUDA kernels. It also provides
|
||||
CPU-only benchmarking facilities intended for non-trivial CPU workloads, but is
|
||||
not optimized for CPU microbenchmarks. This may change in the future, but for now,
|
||||
consider using Google Benchmark for high resolution CPU benchmarks.
|
||||
|
||||
@@ -4,7 +4,7 @@ A basic kernel benchmark can be created with just a few lines of CUDA C++:
|
||||
|
||||
```cpp
|
||||
void my_benchmark(nvbench::state& state) {
|
||||
state.exec([](nvbench::launch& launch) {
|
||||
state.exec([](nvbench::launch& launch) {
|
||||
my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>();
|
||||
});
|
||||
}
|
||||
@@ -97,7 +97,7 @@ void benchmark(nvbench::state& state)
|
||||
const auto num_inputs = state.get_int64("NumInputs");
|
||||
thrust::device_vector<int> data = generate_input(num_inputs);
|
||||
|
||||
state.exec([&data](nvbench::launch& launch) {
|
||||
state.exec([&data](nvbench::launch& launch) {
|
||||
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
|
||||
});
|
||||
}
|
||||
@@ -134,7 +134,7 @@ void benchmark(nvbench::state& state)
|
||||
const auto quality = state.get_float64("Quality");
|
||||
|
||||
state.exec([&quality](nvbench::launch& launch)
|
||||
{
|
||||
{
|
||||
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(quality);
|
||||
});
|
||||
}
|
||||
@@ -153,7 +153,7 @@ void benchmark(nvbench::state& state)
|
||||
thrust::device_vector<int> data = generate_input(rng_dist);
|
||||
|
||||
state.exec([&data](nvbench::launch& launch)
|
||||
{
|
||||
{
|
||||
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
|
||||
});
|
||||
}
|
||||
@@ -182,7 +182,7 @@ void my_benchmark(nvbench::state& state, nvbench::type_list<T>)
|
||||
thrust::device_vector<T> data = generate_input<T>();
|
||||
|
||||
state.exec([&data](nvbench::launch& launch)
|
||||
{
|
||||
{
|
||||
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
|
||||
});
|
||||
}
|
||||
@@ -266,7 +266,6 @@ In general::
|
||||
|
||||
More examples can be found in [examples/throughput.cu](../examples/throughput.cu).
|
||||
|
||||
|
||||
# Skip Uninteresting / Invalid Benchmarks
|
||||
|
||||
Sometimes particular combinations of parameters aren't useful or interesting —
|
||||
@@ -294,7 +293,7 @@ void my_benchmark(nvbench::state& state, nvbench::type_list<T, U>)
|
||||
// Skip benchmarks at compile time -- for example, always skip when T == U
|
||||
// (Note that the `type_list` argument defines the same type twice).
|
||||
template <typename SameType>
|
||||
void my_benchmark(nvbench::state& state,
|
||||
void my_benchmark(nvbench::state& state,
|
||||
nvbench::type_list<SameType, SameType>)
|
||||
{
|
||||
state.skip("T must not be the same type as U.");
|
||||
@@ -320,6 +319,15 @@ true:
|
||||
synchronize internally.
|
||||
- `nvbench::exec_tag::timer` requests a timer object that can be used to
|
||||
restrict the timed region.
|
||||
- `nvbench::exec_tag::no_batch` disables batch measurements. This both disables
|
||||
them during execution to reduce runtime, and prevents their compilation to
|
||||
reduce compile-time and binary size.
|
||||
- `nvbench::exec_tag::gpu` is an optional hint that prevents non-GPU benchmarking
|
||||
code from being compiled for a particular benchmark. A runtime error is emitted
|
||||
if the benchmark is defined with `set_is_cpu_only(true)`.
|
||||
- `nvbench::exec_tag::no_gpu` is an optional hint that prevents GPU benchmarking
|
||||
code from being compiled for a particular benchmark. A runtime error is emitted
|
||||
if the benchmark does not also define `set_is_cpu_only(true)`.
|
||||
|
||||
Multiple execution tags may be combined using `operator|`, e.g.
|
||||
|
||||
@@ -370,7 +378,7 @@ Note that using manual timer mode disables batch measurements.
|
||||
void timer_example(nvbench::state& state)
|
||||
{
|
||||
// Pass the `timer` exec tag to request a timer:
|
||||
state.exec(nvbench::exec_tag::timer,
|
||||
state.exec(nvbench::exec_tag::timer,
|
||||
// Lambda now accepts a timer:
|
||||
[](nvbench::launch& launch, auto& timer)
|
||||
{
|
||||
@@ -391,6 +399,79 @@ NVBENCH_BENCH(timer_example);
|
||||
See [examples/exec_tag_timer.cu](../examples/exec_tag_timer.cu) for a complete
|
||||
example.
|
||||
|
||||
## Compilation hints: `nvbench::exec_tag::no_batch`, `gpu`, and `no_gpu`
|
||||
|
||||
These execution tags are optional hints that disable the compilation of various
|
||||
code paths when they are not needed. They apply only to a single benchmark.
|
||||
|
||||
- `nvbench::exec_tag::no_batch` prevents the execution and instantiation of the batch measurement backend.
|
||||
- `nvbench::exec_tag::gpu` prevents the instantiation of CPU-only benchmarking backends.
|
||||
- Requires that the benchmark does not define `set_is_cpu_only(true)`.
|
||||
- Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
|
||||
- Host-side CPU measurements of GPU kernel execution time are still provided.
|
||||
- `nvbench::exec_tag::no_gpu` prevents the instantiation of GPU benchmarking backends.
|
||||
- Requires that the benchmark defines `set_is_cpu_only(true)`.
|
||||
- Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
|
||||
- See also [CPU-only Benchmarks](#cpu-only-benchmarks).
|
||||
|
||||
# CPU-only Benchmarks
|
||||
|
||||
NVBench provides CPU-only benchmarking facilities that are intended for measuring
|
||||
significant CPU workloads. We do not recommend using these features for high-resolution
|
||||
CPU benchmarking -- other libraries (such as Google Benchmark) are more appropriate for
|
||||
such applications. Examples are provided in [examples/cpu_only.cu](../examples/cpu_only.cu).
|
||||
|
||||
Note that NVBench still requires a CUDA compiler and runtime even if a project only contains
|
||||
CPU-only benchmarks.
|
||||
|
||||
The `is_cpu_only` property of the benchmark toggles between GPU and CPU-only measurements:
|
||||
|
||||
```cpp
|
||||
void my_cpu_benchmark(nvbench::state &state)
|
||||
{
|
||||
state.exec([](nvbench::launch &) { /* workload */ });
|
||||
}
|
||||
NVBENCH_BENCH(my_cpu_benchmark)
|
||||
.set_is_cpu_only(true); // Mark as CPU-only.
|
||||
```
|
||||
|
||||
The optional `nvbench::exec_tag::no_gpu` hint may be used to reduce the compilation time and
|
||||
binary size of CPU-only benchmarks. An error is emitted at runtime if this tag is used while
|
||||
`is_cpu_only` is false.
|
||||
|
||||
```cpp
|
||||
void my_cpu_benchmark(nvbench::state &state)
|
||||
{
|
||||
state.exec(nvbench::exec_tag::no_gpu, // Prevent compilation of GPU backends
|
||||
[](nvbench::launch &) { /* workload */ });
|
||||
}
|
||||
NVBENCH_BENCH(my_cpu_benchmark)
|
||||
.set_is_cpu_only(true); // Mark as CPU-only.
|
||||
```
|
||||
|
||||
The `nvbench::exec_tag::timer` execution tag is also supported by CPU-only benchmarks. This
|
||||
is useful for benchmarks that require additional per-sample setup/teardown. See the
|
||||
[`nvbench::exec_tag::timer`](#explicit-timer-mode-nvbenchexec_tagtimer) section for more
|
||||
details.
|
||||
|
||||
```cpp
|
||||
void my_cpu_benchmark(nvbench::state &state)
|
||||
{
|
||||
state.exec(nvbench::exec_tag::no_gpu | // Prevent compilation of GPU backends
|
||||
nvbench::exec_tag::timer, // Request a timer object
|
||||
[](nvbench::launch &, auto &timer)
|
||||
{
|
||||
// Setup here
|
||||
timer.start();
|
||||
// timed workload
|
||||
timer.stop();
|
||||
// teardown here
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(my_cpu_benchmark)
|
||||
.set_is_cpu_only(true); // Mark as CPU-only.
|
||||
```
|
||||
|
||||
# Beware: Combinatorial Explosion Is Lurking
|
||||
|
||||
Be very careful of how quickly the configuration space can grow. The following
|
||||
@@ -403,7 +484,7 @@ using value_types = nvbench::type_list<nvbench::uint8_t,
|
||||
nvbench::int32_t,
|
||||
nvbench::float32_t,
|
||||
nvbench::float64_t>;
|
||||
using op_types = nvbench::type_list<thrust::plus<>,
|
||||
using op_types = nvbench::type_list<thrust::plus<>,
|
||||
thrust::multiplies<>,
|
||||
thrust::maximum<>>;
|
||||
|
||||
@@ -418,7 +499,7 @@ NVBENCH_BENCH_TYPES(my_benchmark,
|
||||
|
||||
```
|
||||
960 total configs
|
||||
= 4 [T=(U8, I32, F32, F64)]
|
||||
= 4 [T=(U8, I32, F32, F64)]
|
||||
* 4 [U=(U8, I32, F32, F64)]
|
||||
* 4 [V=(U8, I32, F32, F64)]
|
||||
* 3 [Op=(plus, multiplies, max)]
|
||||
@@ -427,8 +508,8 @@ NVBENCH_BENCH_TYPES(my_benchmark,
|
||||
|
||||
For large configuration spaces like this, pruning some of the less useful
|
||||
combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques
|
||||
described in the "Skip Uninteresting / Invalid Benchmarks" section can help
|
||||
immensely with keeping compile / run times manageable.
|
||||
described in the [Skip Uninteresting / Invalid Benchmarks](#skip-uninteresting--invalid-benchmarks)
|
||||
section can help immensely with keeping compile / run times manageable.
|
||||
|
||||
Splitting a single large configuration space into multiple, more focused
|
||||
benchmarks with reduced dimensionality will likely be worth the effort as well.
|
||||
|
||||
@@ -2,6 +2,7 @@ set(example_srcs
|
||||
auto_throughput.cu
|
||||
axes.cu
|
||||
custom_criterion.cu
|
||||
cpu_only.cu
|
||||
enums.cu
|
||||
exec_tag_sync.cu
|
||||
exec_tag_timer.cu
|
||||
|
||||
83
examples/cpu_only.cu
Normal file
83
examples/cpu_only.cu
Normal file
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright 2025 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/nvbench.cuh>
|
||||
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
|
||||
// Block execution of the current CPU thread for `seconds` seconds.
|
||||
void sleep_host(double seconds)
|
||||
{
|
||||
std::this_thread::sleep_for(
|
||||
std::chrono::milliseconds(static_cast<nvbench::int64_t>(seconds * 1000)));
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
// Simple CPU-only benchmark that sleeps on host for a specified duration.
|
||||
void simple(nvbench::state &state)
|
||||
{
|
||||
const auto duration = state.get_float64("Duration");
|
||||
|
||||
state.exec([duration](nvbench::launch &) { sleep_host(duration); });
|
||||
}
|
||||
NVBENCH_BENCH(simple)
|
||||
// 100 -> 500 ms in 100 ms increments.
|
||||
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
|
||||
// Mark as CPU-only.
|
||||
.set_is_cpu_only(true);
|
||||
|
||||
//=============================================================================
|
||||
// Simple CPU-only benchmark that sleeps on host for a specified duration and
|
||||
// uses a custom timed region.
|
||||
void simple_timer(nvbench::state &state)
|
||||
{
|
||||
const auto duration = state.get_float64("Duration");
|
||||
|
||||
state.exec(nvbench::exec_tag::timer, [duration](nvbench::launch &, auto &timer) {
|
||||
// Do any setup work before starting the timer here...
|
||||
timer.start();
|
||||
|
||||
// The region of code to be timed:
|
||||
sleep_host(duration);
|
||||
|
||||
timer.stop();
|
||||
// Any per-run cleanup here...
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(simple_timer)
|
||||
// 100 -> 500 ms in 100 ms increments.
|
||||
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
|
||||
// Mark as CPU-only.
|
||||
.set_is_cpu_only(true);
|
||||
|
||||
//=============================================================================
|
||||
// Simple CPU-only benchmark that uses the optional `nvbench::exec_tag::no_gpu`
|
||||
// hint to prevent GPU measurement code from being instantiated. Note that
|
||||
// `set_is_cpu_only(true)` is still required when using this hint.
|
||||
void simple_no_gpu(nvbench::state &state)
|
||||
{
|
||||
const auto duration = state.get_float64("Duration");
|
||||
|
||||
state.exec(nvbench::exec_tag::no_gpu, [duration](nvbench::launch &) { sleep_host(duration); });
|
||||
}
|
||||
NVBENCH_BENCH(simple_no_gpu)
|
||||
// 100 -> 500 ms in 100 ms increments.
|
||||
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
|
||||
// Mark as CPU-only.
|
||||
.set_is_cpu_only(true);
|
||||
@@ -25,6 +25,7 @@ set(srcs
|
||||
|
||||
detail/entropy_criterion.cxx
|
||||
detail/measure_cold.cu
|
||||
detail/measure_cpu_only.cxx
|
||||
detail/measure_hot.cu
|
||||
detail/state_generator.cxx
|
||||
detail/stdrel_criterion.cxx
|
||||
|
||||
@@ -159,6 +159,16 @@ struct benchmark_base
|
||||
}
|
||||
/// @}
|
||||
|
||||
/// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
|
||||
/// @{
|
||||
[[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
|
||||
benchmark_base &set_is_cpu_only(bool is_cpu_only)
|
||||
{
|
||||
m_is_cpu_only = is_cpu_only;
|
||||
return *this;
|
||||
}
|
||||
/// @}
|
||||
|
||||
/// If true, the benchmark is only run once, skipping all warmup runs and only
|
||||
/// executing a single non-batched measurement. This is intended for use with
|
||||
/// external profiling tools. @{
|
||||
@@ -263,6 +273,7 @@ protected:
|
||||
|
||||
optional_ref<nvbench::printer_base> m_printer;
|
||||
|
||||
bool m_is_cpu_only{false};
|
||||
bool m_run_once{false};
|
||||
bool m_disable_blocking_kernel{false};
|
||||
|
||||
|
||||
@@ -34,12 +34,18 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
|
||||
result->m_axes = m_axes;
|
||||
result->m_devices = m_devices;
|
||||
|
||||
result->m_min_samples = m_min_samples;
|
||||
result->m_criterion_params = m_criterion_params;
|
||||
result->m_printer = m_printer;
|
||||
|
||||
result->m_is_cpu_only = m_is_cpu_only;
|
||||
result->m_run_once = m_run_once;
|
||||
result->m_disable_blocking_kernel = m_disable_blocking_kernel;
|
||||
|
||||
result->m_min_samples = m_min_samples;
|
||||
|
||||
result->m_skip_time = m_skip_time;
|
||||
result->m_timeout = m_timeout;
|
||||
|
||||
result->m_criterion_params = m_criterion_params;
|
||||
result->m_stopping_criterion = m_stopping_criterion;
|
||||
|
||||
return result;
|
||||
|
||||
@@ -40,7 +40,10 @@ void benchmark_manager::initialize()
|
||||
const auto& mgr = device_manager::get();
|
||||
for (auto& bench : m_benchmarks)
|
||||
{
|
||||
bench->set_devices(mgr.get_devices());
|
||||
if (!bench->get_is_cpu_only())
|
||||
{
|
||||
bench->set_devices(mgr.get_devices());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
141
nvbench/detail/measure_cpu_only.cuh
Normal file
141
nvbench/detail/measure_cpu_only.cuh
Normal file
@@ -0,0 +1,141 @@
|
||||
/*
|
||||
* Copyright 2021-2025 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <nvbench/cpu_timer.cuh>
|
||||
#include <nvbench/exec_tag.cuh>
|
||||
#include <nvbench/launch.cuh>
|
||||
#include <nvbench/stopping_criterion.cuh>
|
||||
|
||||
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
|
||||
#include <nvbench/detail/statistics.cuh>
|
||||
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
struct state;
|
||||
|
||||
namespace detail
|
||||
{
|
||||
|
||||
// non-templated code goes here:
|
||||
struct measure_cpu_only_base
|
||||
{
|
||||
explicit measure_cpu_only_base(nvbench::state &exec_state);
|
||||
measure_cpu_only_base(const measure_cpu_only_base &) = delete;
|
||||
measure_cpu_only_base(measure_cpu_only_base &&) = delete;
|
||||
measure_cpu_only_base &operator=(const measure_cpu_only_base &) = delete;
|
||||
measure_cpu_only_base &operator=(measure_cpu_only_base &&) = delete;
|
||||
|
||||
protected:
|
||||
void check();
|
||||
void initialize();
|
||||
void run_trials_prologue();
|
||||
void record_measurements();
|
||||
bool is_finished();
|
||||
void run_trials_epilogue();
|
||||
void generate_summaries();
|
||||
|
||||
void check_skip_time(nvbench::float64_t warmup_time);
|
||||
|
||||
nvbench::state &m_state;
|
||||
|
||||
// Required to satisfy the KernelLauncher interface:
|
||||
nvbench::launch m_launch;
|
||||
|
||||
nvbench::cpu_timer m_cpu_timer;
|
||||
nvbench::cpu_timer m_walltime_timer;
|
||||
|
||||
nvbench::criterion_params m_criterion_params;
|
||||
nvbench::stopping_criterion_base& m_stopping_criterion;
|
||||
|
||||
bool m_run_once{false};
|
||||
|
||||
nvbench::int64_t m_min_samples{};
|
||||
|
||||
nvbench::float64_t m_skip_time{};
|
||||
nvbench::float64_t m_timeout{};
|
||||
|
||||
nvbench::int64_t m_total_samples{};
|
||||
nvbench::float64_t m_total_cpu_time{};
|
||||
nvbench::float64_t m_cpu_noise{}; // rel stdev
|
||||
|
||||
std::vector<nvbench::float64_t> m_cpu_times;
|
||||
|
||||
bool m_max_time_exceeded{};
|
||||
};
|
||||
|
||||
template <typename KernelLauncher>
|
||||
struct measure_cpu_only : public measure_cpu_only_base
|
||||
{
|
||||
measure_cpu_only(nvbench::state &state, KernelLauncher &kernel_launcher)
|
||||
: measure_cpu_only_base(state)
|
||||
, m_kernel_launcher{kernel_launcher}
|
||||
{}
|
||||
|
||||
void operator()()
|
||||
{
|
||||
this->check();
|
||||
this->initialize();
|
||||
this->run_warmup();
|
||||
|
||||
this->run_trials_prologue();
|
||||
this->run_trials();
|
||||
this->run_trials_epilogue();
|
||||
|
||||
this->generate_summaries();
|
||||
}
|
||||
|
||||
private:
|
||||
// Run the kernel once, measuring the CPU time. If under skip_time, skip the
|
||||
// measurement.
|
||||
void run_warmup()
|
||||
{
|
||||
if (m_run_once)
|
||||
{ // Skip warmups
|
||||
return;
|
||||
}
|
||||
|
||||
this->launch_kernel(m_cpu_timer);
|
||||
this->check_skip_time(m_cpu_timer.get_duration());
|
||||
}
|
||||
|
||||
void run_trials()
|
||||
{
|
||||
do
|
||||
{
|
||||
this->launch_kernel(m_cpu_timer);
|
||||
this->record_measurements();
|
||||
} while (!this->is_finished());
|
||||
}
|
||||
|
||||
template <typename TimerT>
|
||||
__forceinline__ void launch_kernel(TimerT &timer)
|
||||
{
|
||||
m_kernel_launcher(m_launch, timer);
|
||||
}
|
||||
|
||||
KernelLauncher &m_kernel_launcher;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
} // namespace nvbench
|
||||
246
nvbench/detail/measure_cpu_only.cxx
Normal file
246
nvbench/detail/measure_cpu_only.cxx
Normal file
@@ -0,0 +1,246 @@
|
||||
/*
|
||||
* Copyright 2021-2025 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/benchmark_base.cuh>
|
||||
#include <nvbench/criterion_manager.cuh>
|
||||
#include <nvbench/detail/measure_cpu_only.cuh>
|
||||
#include <nvbench/detail/throw.cuh>
|
||||
#include <nvbench/printer_base.cuh>
|
||||
#include <nvbench/state.cuh>
|
||||
#include <nvbench/summary.cuh>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace nvbench::detail
|
||||
{
|
||||
|
||||
measure_cpu_only_base::measure_cpu_only_base(state &exec_state)
|
||||
: m_state{exec_state}
|
||||
, m_launch(m_state.get_cuda_stream())
|
||||
, m_criterion_params{exec_state.get_criterion_params()}
|
||||
, m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(exec_state.get_stopping_criterion())}
|
||||
, m_run_once{exec_state.get_run_once()}
|
||||
, m_min_samples{exec_state.get_min_samples()}
|
||||
, m_skip_time{exec_state.get_skip_time()}
|
||||
, m_timeout{exec_state.get_timeout()}
|
||||
{
|
||||
if (m_min_samples > 0)
|
||||
{
|
||||
m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
|
||||
}
|
||||
}
|
||||
|
||||
void measure_cpu_only_base::check()
|
||||
{
|
||||
// no-op
|
||||
}
|
||||
|
||||
void measure_cpu_only_base::initialize()
|
||||
{
|
||||
m_total_cpu_time = 0.;
|
||||
m_cpu_noise = 0.;
|
||||
m_total_samples = 0;
|
||||
m_cpu_times.clear();
|
||||
m_max_time_exceeded = false;
|
||||
|
||||
m_stopping_criterion.initialize(m_criterion_params);
|
||||
}
|
||||
|
||||
void measure_cpu_only_base::run_trials_prologue() { m_walltime_timer.start(); }
|
||||
|
||||
void measure_cpu_only_base::record_measurements()
|
||||
{
|
||||
// Update and record timers and counters:
|
||||
const auto cur_cpu_time = m_cpu_timer.get_duration();
|
||||
m_cpu_times.push_back(cur_cpu_time);
|
||||
m_total_cpu_time += cur_cpu_time;
|
||||
++m_total_samples;
|
||||
|
||||
m_stopping_criterion.add_measurement(cur_cpu_time);
|
||||
}
|
||||
|
||||
bool measure_cpu_only_base::is_finished()
|
||||
{
|
||||
if (m_run_once)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check that we've gathered enough samples:
|
||||
if (m_total_samples > m_min_samples)
|
||||
{
|
||||
if (m_stopping_criterion.is_finished())
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for timeouts:
|
||||
m_walltime_timer.stop();
|
||||
if (m_walltime_timer.get_duration() > m_timeout)
|
||||
{
|
||||
m_max_time_exceeded = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void measure_cpu_only_base::run_trials_epilogue()
|
||||
{
|
||||
// Only need to compute this at the end, not per iteration.
|
||||
const auto cpu_mean = m_total_cpu_time / static_cast<nvbench::float64_t>(m_total_samples);
|
||||
const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
|
||||
m_cpu_times.cend(),
|
||||
cpu_mean);
|
||||
m_cpu_noise = cpu_stdev / cpu_mean;
|
||||
|
||||
m_walltime_timer.stop();
|
||||
}
|
||||
|
||||
void measure_cpu_only_base::generate_summaries()
|
||||
{
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cpu_only/sample_size");
|
||||
summ.set_string("name", "Samples");
|
||||
summ.set_string("hint", "sample_size");
|
||||
summ.set_string("description", "Number of isolated kernel executions");
|
||||
summ.set_int64("value", m_total_samples);
|
||||
}
|
||||
|
||||
const auto d_samples = static_cast<nvbench::float64_t>(m_total_samples);
|
||||
const auto avg_cpu_time = m_total_cpu_time / d_samples;
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/mean");
|
||||
summ.set_string("name", "CPU Time");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("description",
|
||||
"Mean isolated kernel execution time "
|
||||
"(measured on host CPU)");
|
||||
summ.set_float64("value", avg_cpu_time);
|
||||
}
|
||||
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/stdev/relative");
|
||||
summ.set_string("name", "Noise");
|
||||
summ.set_string("hint", "percentage");
|
||||
summ.set_string("description", "Relative standard deviation of isolated CPU times");
|
||||
summ.set_float64("value", m_cpu_noise);
|
||||
}
|
||||
|
||||
if (const auto items = m_state.get_element_count(); items != 0)
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cpu_only/bw/item_rate");
|
||||
summ.set_string("name", "Elem/s");
|
||||
summ.set_string("hint", "item_rate");
|
||||
summ.set_string("description", "Number of input elements processed per second");
|
||||
summ.set_float64("value", static_cast<double>(items) / avg_cpu_time);
|
||||
}
|
||||
|
||||
if (const auto bytes = m_state.get_global_memory_rw_bytes(); bytes != 0)
|
||||
{
|
||||
const auto avg_used_gmem_bw = static_cast<double>(bytes) / avg_cpu_time;
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cpu_only/bw/global/bytes_per_second");
|
||||
summ.set_string("name", "GlobalMem BW");
|
||||
summ.set_string("hint", "byte_rate");
|
||||
summ.set_string("description",
|
||||
"Number of bytes read/written per second.");
|
||||
summ.set_float64("value", avg_used_gmem_bw);
|
||||
}
|
||||
} // bandwidth
|
||||
|
||||
{
|
||||
auto &summ = m_state.add_summary("nv/cpu_only/walltime");
|
||||
summ.set_string("name", "Walltime");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("description", "Walltime used for isolated measurements");
|
||||
summ.set_float64("value", m_walltime_timer.get_duration());
|
||||
summ.set_string("hide", "Hidden by default.");
|
||||
}
|
||||
|
||||
// Log if a printer exists:
|
||||
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
|
||||
{
|
||||
auto &printer = printer_opt_ref.value().get();
|
||||
|
||||
if (m_max_time_exceeded)
|
||||
{
|
||||
const auto timeout = m_walltime_timer.get_duration();
|
||||
const auto max_noise = m_criterion_params.get_float64("max-noise");
|
||||
const auto min_time = m_criterion_params.get_float64("min-time");
|
||||
|
||||
if (m_cpu_noise > max_noise)
|
||||
{
|
||||
printer.log(nvbench::log_level::warn,
|
||||
fmt::format("Current measurement timed out ({:0.2f}s) "
|
||||
"while over noise threshold ({:0.2f}% > "
|
||||
"{:0.2f}%)",
|
||||
timeout,
|
||||
m_cpu_noise * 100,
|
||||
max_noise * 100));
|
||||
}
|
||||
if (m_total_samples < m_min_samples)
|
||||
{
|
||||
printer.log(nvbench::log_level::warn,
|
||||
fmt::format("Current measurement timed out ({:0.2f}s) "
|
||||
"before accumulating min_samples ({} < {})",
|
||||
timeout,
|
||||
m_total_samples,
|
||||
m_min_samples));
|
||||
}
|
||||
if (m_total_cpu_time < min_time)
|
||||
{
|
||||
printer.log(nvbench::log_level::warn,
|
||||
fmt::format("Current measurement timed out ({:0.2f}s) "
|
||||
"before accumulating min_time ({:0.2f}s < "
|
||||
"{:0.2f}s)",
|
||||
timeout,
|
||||
m_total_cpu_time,
|
||||
min_time));
|
||||
}
|
||||
}
|
||||
|
||||
// Log to stdout:
|
||||
printer.log(nvbench::log_level::pass,
|
||||
fmt::format("CpuOnly: {:0.6f}ms mean CPU, {:0.2f}s total CPU, "
|
||||
"{:0.2f}s total wall, {}x ",
|
||||
avg_cpu_time * 1e3,
|
||||
m_total_cpu_time,
|
||||
m_walltime_timer.get_duration(),
|
||||
m_total_samples));
|
||||
|
||||
printer.process_bulk_data(m_state, "nv/cpu_only/sample_times", "sample_times", m_cpu_times);
|
||||
}
|
||||
}
|
||||
|
||||
void measure_cpu_only_base::check_skip_time(nvbench::float64_t warmup_time)
|
||||
{
|
||||
if (m_skip_time > 0. && warmup_time < m_skip_time)
|
||||
{
|
||||
auto reason = fmt::format("Warmup time did not meet skip_time limit: "
|
||||
"{:0.3f}us < {:0.3f}us.",
|
||||
warmup_time * 1e6,
|
||||
m_skip_time * 1e6);
|
||||
|
||||
m_state.skip(reason);
|
||||
NVBENCH_THROW(std::runtime_error, "{}", std::move(reason));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace nvbench::detail
|
||||
@@ -24,15 +24,16 @@
|
||||
#endif // NVBENCH_STATE_EXEC_GUARD
|
||||
|
||||
#include <nvbench/config.cuh>
|
||||
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
|
||||
#include <nvbench/detail/measure_cold.cuh>
|
||||
#include <nvbench/detail/measure_cpu_only.cuh>
|
||||
#include <nvbench/detail/measure_hot.cuh>
|
||||
#include <nvbench/exec_tag.cuh>
|
||||
#include <nvbench/state.cuh>
|
||||
|
||||
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
|
||||
#ifdef NVBENCH_HAS_CUPTI
|
||||
#include <nvbench/detail/measure_cupti.cuh>
|
||||
#endif // NVBENCH_HAS_CUPTI
|
||||
#include <nvbench/detail/measure_cold.cuh>
|
||||
#include <nvbench/detail/measure_hot.cuh>
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
@@ -45,21 +46,44 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
|
||||
using KL = typename std::remove_reference<KernelLauncher>::type;
|
||||
using namespace nvbench::exec_tag::impl;
|
||||
static_assert(is_exec_tag_v<ExecTags>,
|
||||
"`ExecTags` argument must be a member (or combination of "
|
||||
"members) from nvbench::exec_tag.");
|
||||
"`ExecTags` argument must be a member (or combination of members) from "
|
||||
"`nvbench::exec_tag`.");
|
||||
static_assert(!((tags & gpu) && (tags & no_gpu)),
|
||||
"`nvbench::exec_tag::gpu` and `nvbench::exec_tag::no_gpu` are mutually "
|
||||
"exclusive.");
|
||||
|
||||
constexpr auto measure_tags = tags & measure_mask;
|
||||
constexpr auto modifier_tags = tags & modifier_mask;
|
||||
constexpr auto measure_tags = tags & measure_mask;
|
||||
|
||||
// "run once" is handled by the cold measurement:
|
||||
if ((modifier_tags & no_gpu) && !this->get_is_cpu_only())
|
||||
{
|
||||
throw std::runtime_error("The `nvbench::exec_tag::no_gpu` tag requires that "
|
||||
"`set_is_cpu_only(true)` is called when defining the benchmark.");
|
||||
}
|
||||
|
||||
if ((modifier_tags & gpu) && this->get_is_cpu_only())
|
||||
{
|
||||
throw std::runtime_error("The `nvbench::exec_tag::gpu` tag requires that "
|
||||
"`set_is_cpu_only(true)` is NOT called when defining the benchmark.");
|
||||
}
|
||||
|
||||
// "run once" should disable batch measurements:
|
||||
// TODO This should just be a runtime branch in measure_cold. Currently this causes two versions
|
||||
// of measure_cold to be compiled. We don't expose the `run_once` tag to users, it should be
|
||||
// removed.
|
||||
// TODO CPU measurements should support run_once as well.
|
||||
if (!(modifier_tags & run_once) && this->get_run_once())
|
||||
{
|
||||
constexpr auto run_once_tags = modifier_tags | cold | run_once;
|
||||
constexpr auto run_once_tags = modifier_tags | run_once | (measure_tags & ~hot);
|
||||
this->exec(run_once_tags, std::forward<KernelLauncher>(kernel_launcher));
|
||||
return;
|
||||
}
|
||||
|
||||
if (!(modifier_tags & no_block) && this->get_disable_blocking_kernel())
|
||||
// TODO The `no_block` tag should be removed and replaced with a runtime branch in measure_cold
|
||||
// and measure_hot. Currently this causes unnecesaary codegen. Note that the `sync` exec_tag
|
||||
// implies `no_block` when refactoring.
|
||||
if (!(measure_tags & cpu_only) && !(modifier_tags & no_block) &&
|
||||
this->get_disable_blocking_kernel())
|
||||
{
|
||||
constexpr auto no_block_tags = tags | no_block;
|
||||
this->exec(no_block_tags, std::forward<KernelLauncher>(kernel_launcher));
|
||||
@@ -69,14 +93,32 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
|
||||
// If no measurements selected, pick some defaults based on the modifiers:
|
||||
if constexpr (!measure_tags)
|
||||
{
|
||||
if constexpr (modifier_tags & (timer | sync))
|
||||
{ // Can't do hot timings with manual timer or sync; whole point is to not
|
||||
// sync in between executions.
|
||||
this->exec(cold | tags, std::forward<KernelLauncher>(kernel_launcher));
|
||||
}
|
||||
else
|
||||
if constexpr (modifier_tags & gpu)
|
||||
{
|
||||
this->exec(cold | hot | tags, std::forward<KernelLauncher>(kernel_launcher));
|
||||
if constexpr (modifier_tags & no_batch)
|
||||
{
|
||||
this->exec(cold | modifier_tags, std::forward<KernelLauncher>(kernel_launcher));
|
||||
}
|
||||
else
|
||||
{
|
||||
this->exec(cold | hot | modifier_tags, std::forward<KernelLauncher>(kernel_launcher));
|
||||
}
|
||||
}
|
||||
else if constexpr (modifier_tags & no_gpu)
|
||||
{
|
||||
this->exec(cpu_only | modifier_tags, std::forward<KernelLauncher>(kernel_launcher));
|
||||
}
|
||||
else // Instantiate both CPU and GPU measurement code:
|
||||
{
|
||||
if constexpr (modifier_tags & no_batch)
|
||||
{
|
||||
this->exec(cold | cpu_only | modifier_tags, std::forward<KernelLauncher>(kernel_launcher));
|
||||
}
|
||||
else
|
||||
{
|
||||
this->exec(cold | hot | cpu_only | modifier_tags,
|
||||
std::forward<KernelLauncher>(kernel_launcher));
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -86,62 +128,90 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
|
||||
return;
|
||||
}
|
||||
|
||||
// Each measurement is deliberately isolated in constexpr branches to
|
||||
// avoid instantiating unused measurements.
|
||||
if constexpr (tags & cold)
|
||||
if (this->get_is_cpu_only())
|
||||
{
|
||||
constexpr bool use_blocking_kernel = !(tags & no_block);
|
||||
if constexpr (tags & timer)
|
||||
if constexpr (tags & cpu_only) // Prevent instantiation when not needed
|
||||
{
|
||||
static_assert(!(tags & gpu), "CPU-only measurement doesn't support the `gpu` exec_tag.");
|
||||
|
||||
if constexpr (tags & timer)
|
||||
{
|
||||
using measure_t = nvbench::detail::measure_cpu_only<KL>;
|
||||
measure_t measure{*this, kernel_launcher};
|
||||
measure();
|
||||
}
|
||||
else
|
||||
{ // Need to wrap the kernel launcher with a timer wrapper:
|
||||
using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
|
||||
wrapper_t wrapper{kernel_launcher};
|
||||
|
||||
using measure_t = nvbench::detail::measure_cpu_only<wrapper_t>;
|
||||
measure_t measure(*this, wrapper);
|
||||
measure();
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if constexpr (tags & cold) // Prevent instantiation when not needed
|
||||
{
|
||||
static_assert(!(tags & no_gpu), "Cold measurement doesn't support the `no_gpu` exec_tag.");
|
||||
|
||||
constexpr bool use_blocking_kernel = !(tags & no_block);
|
||||
if constexpr (tags & timer)
|
||||
{
|
||||
// Estimate bandwidth here
|
||||
#ifdef NVBENCH_HAS_CUPTI
|
||||
if constexpr (!(modifier_tags & run_once))
|
||||
{
|
||||
if (this->is_cupti_required())
|
||||
if constexpr (!(modifier_tags & run_once))
|
||||
{
|
||||
using measure_t = nvbench::detail::measure_cupti<KL>;
|
||||
measure_t measure{*this, kernel_launcher};
|
||||
measure();
|
||||
if (this->is_cupti_required())
|
||||
{
|
||||
using measure_t = nvbench::detail::measure_cupti<KL>;
|
||||
measure_t measure{*this, kernel_launcher};
|
||||
measure();
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
|
||||
using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
|
||||
measure_t measure{*this, kernel_launcher};
|
||||
measure();
|
||||
}
|
||||
else
|
||||
{ // Need to wrap the kernel launcher with a timer wrapper:
|
||||
using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
|
||||
wrapper_t wrapper{kernel_launcher};
|
||||
|
||||
// Estimate bandwidth here
|
||||
#ifdef NVBENCH_HAS_CUPTI
|
||||
if constexpr (!(modifier_tags & run_once))
|
||||
{
|
||||
if (this->is_cupti_required())
|
||||
{
|
||||
using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
|
||||
measure_t measure{*this, wrapper};
|
||||
measure();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
|
||||
measure_t measure(*this, wrapper);
|
||||
measure();
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (tags & hot) // Prevent instantiation when not needed
|
||||
{
|
||||
static_assert(!(tags & sync), "Hot measurement doesn't support the `sync` exec_tag.");
|
||||
static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
|
||||
static_assert(!(tags & no_batch), "Hot measurement doesn't support the `no_batch` exec_tag.");
|
||||
static_assert(!(tags & no_gpu), "Hot measurement doesn't support the `no_gpu` exec_tag.");
|
||||
constexpr bool use_blocking_kernel = !(tags & no_block);
|
||||
using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
|
||||
measure_t measure{*this, kernel_launcher};
|
||||
measure();
|
||||
}
|
||||
else
|
||||
{ // Need to wrap the kernel launcher with a timer wrapper:
|
||||
using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
|
||||
wrapper_t wrapper{kernel_launcher};
|
||||
|
||||
// Estimate bandwidth here
|
||||
#ifdef NVBENCH_HAS_CUPTI
|
||||
if constexpr (!(modifier_tags & run_once))
|
||||
{
|
||||
if (this->is_cupti_required())
|
||||
{
|
||||
using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
|
||||
measure_t measure{*this, wrapper};
|
||||
measure();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
|
||||
measure_t measure(*this, wrapper);
|
||||
measure();
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (tags & hot)
|
||||
{
|
||||
static_assert(!(tags & sync), "Hot measurement doesn't support the `sync` exec_tag.");
|
||||
static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
|
||||
constexpr bool use_blocking_kernel = !(tags & no_block);
|
||||
using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
|
||||
measure_t measure{*this, kernel_launcher};
|
||||
measure();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,12 +35,17 @@ enum class exec_flag
|
||||
no_block = 0x02, // Disables use of `blocking_kernel`.
|
||||
sync = 0x04, // KernelLauncher has indicated that it will sync
|
||||
run_once = 0x08, // Only run the benchmark once (for profiling).
|
||||
modifier_mask = timer | no_block | sync | run_once,
|
||||
gpu = 0x10, // Don't instantiate `measure_cpu_only`.
|
||||
no_gpu = 0x20, // No GPU measurements should be instantiated.
|
||||
no_batch = 0x40, // `measure_hot` will not be used.
|
||||
modifier_mask = 0xFF,
|
||||
|
||||
// Measurement types:
|
||||
// Measurement types to instantiate. Derived from modifiers.
|
||||
// Should not be exposed directly via nvbench::exec_tag::<...>.
|
||||
cold = 0x0100, // measure_cold
|
||||
hot = 0x0200, // measure_hot
|
||||
measure_mask = cold | hot
|
||||
cpu_only = 0x0400, // measure_cpu_only
|
||||
measure_mask = 0xFF00,
|
||||
};
|
||||
|
||||
} // namespace nvbench::detail
|
||||
@@ -95,9 +100,14 @@ using timer_t = tag<nvbench::detail::exec_flag::timer>;
|
||||
using no_block_t = tag<nvbench::detail::exec_flag::no_block>;
|
||||
using sync_t = tag<nvbench::detail::exec_flag::sync>;
|
||||
using run_once_t = tag<nvbench::detail::exec_flag::run_once>;
|
||||
using gpu_t = tag<nvbench::detail::exec_flag::gpu>;
|
||||
using no_gpu_t = tag<nvbench::detail::exec_flag::no_gpu>;
|
||||
using no_batch_t = tag<nvbench::detail::exec_flag::no_batch>;
|
||||
using modifier_mask_t = tag<nvbench::detail::exec_flag::modifier_mask>;
|
||||
|
||||
using hot_t = tag<nvbench::detail::exec_flag::hot>;
|
||||
using cold_t = tag<nvbench::detail::exec_flag::cold>;
|
||||
using modifier_mask_t = tag<nvbench::detail::exec_flag::modifier_mask>;
|
||||
using cpu_only_t = tag<nvbench::detail::exec_flag::cpu_only>;
|
||||
using measure_mask_t = tag<nvbench::detail::exec_flag::measure_mask>;
|
||||
|
||||
constexpr inline none_t none;
|
||||
@@ -105,9 +115,14 @@ constexpr inline timer_t timer;
|
||||
constexpr inline no_block_t no_block;
|
||||
constexpr inline sync_t sync;
|
||||
constexpr inline run_once_t run_once;
|
||||
constexpr inline gpu_t gpu;
|
||||
constexpr inline no_gpu_t no_gpu;
|
||||
constexpr inline no_batch_t no_batch;
|
||||
constexpr inline modifier_mask_t modifier_mask;
|
||||
|
||||
constexpr inline cold_t cold;
|
||||
constexpr inline hot_t hot;
|
||||
constexpr inline modifier_mask_t modifier_mask;
|
||||
constexpr inline cpu_only_t cpu_only;
|
||||
constexpr inline measure_mask_t measure_mask;
|
||||
|
||||
} // namespace impl
|
||||
@@ -116,13 +131,26 @@ constexpr inline auto none = nvbench::exec_tag::impl::none;
|
||||
|
||||
/// Modifier used when only a portion of the KernelLauncher needs to be timed.
|
||||
/// Useful for resetting state in-between timed kernel launches.
|
||||
constexpr inline auto timer = nvbench::exec_tag::impl::timer;
|
||||
constexpr inline auto timer = nvbench::exec_tag::impl::timer | //
|
||||
nvbench::exec_tag::impl::no_batch;
|
||||
|
||||
/// Modifier used to indicate that the KernelGenerator will perform CUDA
|
||||
/// synchronizations. Without this flag such benchmarks will deadlock.
|
||||
constexpr inline auto sync = nvbench::exec_tag::impl::no_block | nvbench::exec_tag::impl::sync;
|
||||
constexpr inline auto sync = nvbench::exec_tag::impl::no_block | //
|
||||
nvbench::exec_tag::impl::sync | //
|
||||
nvbench::exec_tag::impl::no_batch;
|
||||
|
||||
/// Modifier used to indicate that batched measurements should be disabled
|
||||
constexpr inline auto no_batch = nvbench::exec_tag::impl::cold;
|
||||
constexpr inline auto no_batch = nvbench::exec_tag::impl::no_batch;
|
||||
|
||||
/// Optional optimization for CPU-only benchmarks. Requires that `set_is_cpu_only(true)`
|
||||
/// is called when defining the benchmark. Passing this exec_tag will ensure that
|
||||
/// GPU measurement code is not instantiated.
|
||||
constexpr inline auto no_gpu = nvbench::exec_tag::impl::no_gpu;
|
||||
|
||||
/// Optional optimization for GPU benchmarks. Requires that `set_is_cpu_only(true)`
|
||||
/// is NOT called when defining the benchmark. Passing this exec_tag will prevent unused CPU-only
|
||||
/// measurement code from being instantiated.
|
||||
constexpr inline auto gpu = nvbench::exec_tag::impl::gpu;
|
||||
|
||||
} // namespace nvbench::exec_tag
|
||||
|
||||
@@ -227,7 +227,7 @@ void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_
|
||||
const auto &devices = bench.get_devices();
|
||||
const auto &axes = bench.get_axes();
|
||||
|
||||
fmt::format_to(std::back_inserter(buffer), "\n## {}\n", bench.get_name());
|
||||
fmt::format_to(std::back_inserter(buffer), "\n## {}\n\n", bench.get_name());
|
||||
|
||||
// Do a single pass when no devices are specified. This happens for
|
||||
// benchmarks with `cpu` exec_tags.
|
||||
@@ -240,7 +240,10 @@ void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_
|
||||
|
||||
if (device)
|
||||
{
|
||||
fmt::format_to(std::back_inserter(buffer), "\n### [{}] {}\n\n", device->get_id(), device->get_name());
|
||||
fmt::format_to(std::back_inserter(buffer),
|
||||
"### [{}] {}\n\n",
|
||||
device->get_id(),
|
||||
device->get_name());
|
||||
}
|
||||
|
||||
std::size_t row = 0;
|
||||
|
||||
@@ -799,7 +799,10 @@ try
|
||||
else
|
||||
{
|
||||
benchmark_base &bench = *m_benchmarks.back();
|
||||
bench.set_devices(device_vec);
|
||||
if (!bench.get_is_cpu_only())
|
||||
{
|
||||
bench.set_devices(device_vec);
|
||||
}
|
||||
}
|
||||
|
||||
m_recent_devices = std::move(device_vec);
|
||||
|
||||
@@ -71,6 +71,12 @@ struct state
|
||||
/// nullopt for CPU-only benchmarks.
|
||||
[[nodiscard]] const std::optional<nvbench::device_info> &get_device() const { return m_device; }
|
||||
|
||||
/// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
|
||||
/// @{
|
||||
// No setter, this should not be modified after construction, as it is a benchmark-wide property.
|
||||
[[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
|
||||
/// @}
|
||||
|
||||
/// An index into a benchmark::type_configs type_list. Returns 0 if no type
|
||||
/// axes in the associated benchmark.
|
||||
[[nodiscard]] std::size_t get_type_config_index() const { return m_type_config_index; }
|
||||
@@ -293,6 +299,7 @@ private:
|
||||
std::optional<nvbench::device_info> m_device;
|
||||
std::size_t m_type_config_index{};
|
||||
|
||||
bool m_is_cpu_only{false};
|
||||
bool m_run_once{false};
|
||||
bool m_disable_blocking_kernel{false};
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ namespace nvbench
|
||||
|
||||
state::state(const benchmark_base &bench)
|
||||
: m_benchmark{bench}
|
||||
, m_is_cpu_only(bench.get_is_cpu_only())
|
||||
, m_run_once{bench.get_run_once()}
|
||||
, m_disable_blocking_kernel{bench.get_disable_blocking_kernel()}
|
||||
, m_criterion_params{bench.get_criterion_params()}
|
||||
@@ -51,6 +52,7 @@ state::state(const benchmark_base &bench,
|
||||
, m_axis_values{std::move(values)}
|
||||
, m_device{std::move(device)}
|
||||
, m_type_config_index{type_config_index}
|
||||
, m_is_cpu_only(bench.get_is_cpu_only())
|
||||
, m_run_once{bench.get_run_once()}
|
||||
, m_disable_blocking_kernel{bench.get_disable_blocking_kernel()}
|
||||
, m_criterion_params{bench.get_criterion_params()}
|
||||
|
||||
Reference in New Issue
Block a user