mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-03-14 20:27:24 +00:00
Add more examples.
- exec_tag_timer - exec_tag_sync - skip - throughput
This commit is contained in:
21
README.md
21
README.md
@@ -51,13 +51,22 @@ command-line options for configuring benchmark execution at runtime. See the
|
||||
[CLI overview](docs/cli_help.md)
|
||||
and [CLI axis specification](docs/cli_help_axis.md) for more information.
|
||||
|
||||
## Example Sandbox Project
|
||||
## Examples
|
||||
|
||||
To get started with NVBench, consider trying out
|
||||
the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo). This
|
||||
repository contains a simple CMake project that uses NVBench to build an example
|
||||
benchmark. It's a great way to experiment with the library without a lot of
|
||||
investment.
|
||||
This repository provides a number of [examples](examples/) that demonstrate
|
||||
various NVBench features and use cases:
|
||||
|
||||
- [Runtime and compile-time parameter sweeps](examples/axes.cu)
|
||||
- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
|
||||
- [Skipping benchmark configurations](examples/skip.cu)
|
||||
- [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu)
|
||||
- [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_sync.cu)
|
||||
|
||||
To get started using NVBench with your own kernels, consider trying out
|
||||
the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo)
|
||||
. `nvbench_demo` provides a simple CMake project that uses NVBench to build an
|
||||
example benchmark. It's a great way to experiment with the library without a lot
|
||||
of investment.
|
||||
|
||||
# License
|
||||
|
||||
|
||||
@@ -223,8 +223,12 @@ state.add_global_memory_reads<InputType>(size);
|
||||
state.add_global_memory_writes<OutputType>(size);
|
||||
```
|
||||
|
||||
For meaningful results, specify the input element count, and include all reads
|
||||
and writes to global memory.
|
||||
In general:
|
||||
- Add only the input element count (no outputs).
|
||||
- Add all reads and writes to global memory.
|
||||
|
||||
More examples can be found in [examples/throughput.cu](../examples/throughput.cu).
|
||||
|
||||
|
||||
# Skip Uninteresting / Invalid Benchmarks
|
||||
|
||||
@@ -263,6 +267,8 @@ using Us = nvbench::type_list<...>;
|
||||
NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(Ts, Us));
|
||||
```
|
||||
|
||||
More examples can be found in [examples/skip.cu](../examples/skip.cu).
|
||||
|
||||
# Execution Tags For Special Cases
|
||||
|
||||
By default, NVBench assumes that the entire execution time of the
|
||||
@@ -304,6 +310,9 @@ void sync_example(nvbench::state& state)
|
||||
NVBENCH_BENCH(sync_example);
|
||||
```
|
||||
|
||||
See [examples/exec_tag_sync.cu](../examples/exec_tag_sync.cu) for a complete
|
||||
example.
|
||||
|
||||
## Explicit timer mode: `nvbench::exec_tag::timer`
|
||||
|
||||
For some kernels, the working data may need to be reset between launches. This
|
||||
@@ -342,6 +351,9 @@ void timer_example(nvbench::state& state)
|
||||
NVBENCH_BENCH(timer_example);
|
||||
```
|
||||
|
||||
See [examples/exec_tag_timer.cu](../examples/exec_tag_timer.cu) for a complete
|
||||
example.
|
||||
|
||||
# Beware: Combinatorial Explosion Is Lurking
|
||||
|
||||
Be very careful of how quickly the configuration space can grow. The following
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
set(example_srcs
|
||||
axes.cu
|
||||
exec_tag_sync.cu
|
||||
exec_tag_timer.cu
|
||||
skip.cu
|
||||
throughput.cu
|
||||
)
|
||||
|
||||
foreach(example_src IN LISTS example_srcs)
|
||||
|
||||
@@ -24,8 +24,6 @@
|
||||
// Thrust vectors simplify memory management:
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
//==============================================================================
|
||||
// Simple benchmark with no parameter axes:
|
||||
void simple(nvbench::state &state)
|
||||
|
||||
58
examples/exec_tag_sync.cu
Normal file
58
examples/exec_tag_sync.cu
Normal file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/nvbench.cuh>
|
||||
|
||||
// Grab some testing kernels from NVBench:
|
||||
#include <nvbench/test_kernels.cuh>
|
||||
|
||||
// Thrust vectors simplify memory management:
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
// Used to initialize input data:
|
||||
#include <thrust/sequence.h>
|
||||
|
||||
// `sequence_bench` measures the execution time of `thrust::sequence`. Since
|
||||
// algorithms in `thrust::` implicitly sync the CUDA device, the
|
||||
// `nvbench::exec_tag::sync` must be passed to `state.exec(...)`.
|
||||
//
|
||||
// By default, NVBench uses some tricks to improve the GPU timing stability.
|
||||
// This provides more accurate results, but will cause a deadlock if the lambda
|
||||
// passed to `state.exec(...)` synchronizes. The `nvbench::exec_tag::sync` tag
|
||||
// tells NVBench to run the benchmark safely.
|
||||
//
|
||||
// This tag will also disable the batch measurements, since the synchronization
|
||||
// will throw off the batch results.
|
||||
void sequence_bench(nvbench::state &state)
|
||||
{
|
||||
// Allocate input data:
|
||||
const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
|
||||
thrust::device_vector<nvbench::int32_t> data(num_values);
|
||||
|
||||
// Provide throughput information:
|
||||
state.add_element_count(num_values);
|
||||
state.add_global_memory_writes<nvbench::int32_t>(num_values);
|
||||
|
||||
// nvbench::exec_tag::sync indicates that this will implicitly sync:
|
||||
state.exec(nvbench::exec_tag::sync, [&data](nvbench::launch &launch) {
|
||||
thrust::sequence(thrust::device.on(launch.get_stream()),
|
||||
data.begin(),
|
||||
data.end());
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(sequence_bench);
|
||||
73
examples/exec_tag_timer.cu
Normal file
73
examples/exec_tag_timer.cu
Normal file
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/nvbench.cuh>
|
||||
|
||||
// Grab some testing kernels from NVBench:
|
||||
#include <nvbench/test_kernels.cuh>
|
||||
|
||||
// Thrust vectors simplify memory management:
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
// Used to initialize input data:
|
||||
#include <thrust/sequence.h>
|
||||
|
||||
// mod2_inplace performs an in-place mod2 over every element in `data`. `data`
|
||||
// is reset to `input` each iteration. A manual timer is requested by passing
|
||||
// `nvbench::exec_tag::timer` to `state.exec(...)`, which is used to only time
|
||||
// the mod2, and not the reset.
|
||||
//
|
||||
// Note that this disables the batch timings, since the reset phase will throw
|
||||
// off the batch results.
|
||||
|
||||
void mod2_inplace(nvbench::state &state)
|
||||
{
|
||||
// Allocate input data:
|
||||
const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
|
||||
thrust::device_vector<nvbench::int32_t> input(num_values);
|
||||
thrust::sequence(input.begin(), input.end());
|
||||
|
||||
// Working data buffer:
|
||||
thrust::device_vector<nvbench::int32_t> data(num_values);
|
||||
|
||||
// Provide throughput information:
|
||||
state.add_element_count(num_values);
|
||||
state.add_global_memory_reads<nvbench::int32_t>(num_values);
|
||||
state.add_global_memory_writes<nvbench::int32_t>(num_values);
|
||||
|
||||
// Request timer with `nvbench::exec_tag::timer`:
|
||||
state.exec(nvbench::exec_tag::timer,
|
||||
// Lambda now takes a `timer` argument:
|
||||
[&input, &data, num_values](nvbench::launch &launch, auto &timer) {
|
||||
// Reset working data:
|
||||
data = input;
|
||||
|
||||
// Start timer:
|
||||
timer.start();
|
||||
|
||||
// Run kernel of interest:
|
||||
nvbench::mod2_kernel<<<256, 256, 0, launch.get_stream()>>>(
|
||||
thrust::raw_pointer_cast(input.data()),
|
||||
thrust::raw_pointer_cast(input.data()),
|
||||
num_values);
|
||||
|
||||
// Stop timer:
|
||||
timer.stop();
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(mod2_inplace);
|
||||
128
examples/skip.cu
Normal file
128
examples/skip.cu
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/nvbench.cuh>
|
||||
|
||||
// Grab some testing kernels from NVBench:
|
||||
#include <nvbench/test_kernels.cuh>
|
||||
|
||||
// Thrust vectors simplify memory management:
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
// std::enable_if_t
|
||||
#include <type_traits>
|
||||
|
||||
//==============================================================================
|
||||
// `runtime_skip` demonstrates how to skip benchmarks at runtime.
|
||||
//
|
||||
// Two parameter axes are swept (see axes.cu), but some configurations are
|
||||
// skipped by calling `state.skip` with a skip reason string. This reason
|
||||
// is printed to the log and captured in JSON output.
|
||||
void runtime_skip(nvbench::state &state)
|
||||
{
|
||||
const auto duration = state.get_float64("Duration");
|
||||
const auto kramble = state.get_string("Kramble");
|
||||
|
||||
// Skip Baz benchmarks with < 0.8 ms duration.
|
||||
if (kramble == "Baz" && duration < 0.8e-3)
|
||||
{
|
||||
state.skip("Short 'Baz' benchmarks are skipped.");
|
||||
return;
|
||||
}
|
||||
|
||||
// Skip Foo benchmarks with > 0.3 ms duration.
|
||||
if (kramble == "Foo" && duration > 0.3e-3)
|
||||
{
|
||||
state.skip("Long 'Foo' benchmarks are skipped.");
|
||||
return;
|
||||
}
|
||||
|
||||
// Run all others:
|
||||
state.exec([duration](nvbench::launch &launch) {
|
||||
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration);
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(runtime_skip)
|
||||
// 0, 0.25, 0.5, 0.75, and 1.0 milliseconds
|
||||
.add_float64_axis("Duration",
|
||||
nvbench::range(0.,
|
||||
1.1e-3, // .1e-3 slop for fp precision
|
||||
0.25e-3))
|
||||
.add_string_axis("Kramble", {"Foo", "Bar", "Baz"});
|
||||
|
||||
//==============================================================================
|
||||
// `skip_overload` demonstrates how to skip benchmarks at compile-time via
|
||||
// overload resolution.
|
||||
//
|
||||
// Two type axes are swept, but configurations where InputType == OutputType are
|
||||
// skipped.
|
||||
template <typename InputType, typename OutputType>
|
||||
void skip_overload(nvbench::state &state,
|
||||
nvbench::type_list<InputType, OutputType>)
|
||||
{
|
||||
// This is a contrived example that focuses on the skip overloads, so this is
|
||||
// just a sleep kernel:
|
||||
state.exec([](nvbench::launch &launch) {
|
||||
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
|
||||
});
|
||||
}
|
||||
// Overload of skip_overload that is called when InputType == OutputType.
|
||||
template <typename T>
|
||||
void skip_overload(nvbench::state &state, nvbench::type_list<T, T>)
|
||||
{
|
||||
state.skip("InputType == OutputType.");
|
||||
}
|
||||
// The same type_list is used for both inputs/outputs.
|
||||
using sst_types = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
|
||||
// Setup benchmark:
|
||||
NVBENCH_BENCH_TYPES(skip_overload, NVBENCH_TYPE_AXES(sst_types, sst_types))
|
||||
.set_type_axes_names({"In", "Out"});
|
||||
|
||||
//==============================================================================
|
||||
// `skip_sfinae` demonstrates how to skip benchmarks at compile-time using
|
||||
// SFINAE to handle more complex skip conditions.
|
||||
//
|
||||
// Two type axes are swept, but configurations where sizeof(InputType) >
|
||||
// sizeof(OutputType) are skipped.
|
||||
|
||||
// Enable this overload if InputType is not larger than OutputType
|
||||
template <typename InputType, typename OutputType>
|
||||
std::enable_if_t<(sizeof(InputType) <= sizeof(OutputType)), void>
|
||||
skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
|
||||
{
|
||||
// This is a contrived example that focuses on the skip overloads, so this is
|
||||
// just a sleep kernel:
|
||||
state.exec([](nvbench::launch &launch) {
|
||||
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
|
||||
});
|
||||
}
|
||||
// Enable this overload if InputType is larger than OutputType
|
||||
template <typename InputType, typename OutputType>
|
||||
std::enable_if_t<(sizeof(InputType) > sizeof(OutputType)), void>
|
||||
skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
|
||||
{
|
||||
state.skip("sizeof(InputType) > sizeof(OutputType).");
|
||||
}
|
||||
// The same type_list is used for both inputs/outputs.
|
||||
using sn_types = nvbench::type_list<nvbench::int8_t,
|
||||
nvbench::int16_t,
|
||||
nvbench::int32_t,
|
||||
nvbench::int64_t>;
|
||||
// Setup benchmark:
|
||||
NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types))
|
||||
.set_type_axes_names({"In", "Out"});
|
||||
60
examples/throughput.cu
Normal file
60
examples/throughput.cu
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright 2021 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <nvbench/nvbench.cuh>
|
||||
|
||||
// Grab some testing kernels from NVBench:
|
||||
#include <nvbench/test_kernels.cuh>
|
||||
|
||||
// Thrust vectors simplify memory management:
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
// `throughput_bench` copies a 64 MiB buffer of int32_t, and reports throughput
|
||||
// in a variety of ways.
|
||||
//
|
||||
// Calling `state.add_element_count(num_elements)` with the number of input
|
||||
// items will report the item throughput rate in elements-per-second.
|
||||
//
|
||||
// Calling `state.add_global_memory_reads<T>(num_elements)` and/or
|
||||
// `state.add_global_memory_writes<T>(num_elements)` will report global device
|
||||
// memory throughput as a percentage of the current device's peak global memory
|
||||
// bandwidth, and also in bytes-per-second.
|
||||
//
|
||||
// All of these methods take an optional second `column_name` argument, which
|
||||
// will add a new column to the output with the reported element count / buffer
|
||||
// size and column name.
|
||||
void throughput_bench(nvbench::state &state)
|
||||
{
|
||||
// Allocate input data:
|
||||
const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
|
||||
thrust::device_vector<nvbench::int32_t> input(num_values);
|
||||
thrust::device_vector<nvbench::int32_t> output(num_values);
|
||||
|
||||
// Provide throughput information:
|
||||
state.add_element_count(num_values, "NumElements");
|
||||
state.add_global_memory_reads<nvbench::int32_t>(num_values, "DataSize");
|
||||
state.add_global_memory_writes<nvbench::int32_t>(num_values);
|
||||
|
||||
state.exec([&input, &output, num_values](nvbench::launch &launch) {
|
||||
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
|
||||
thrust::raw_pointer_cast(input.data()),
|
||||
thrust::raw_pointer_cast(output.data()),
|
||||
num_values);
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(throughput_bench);
|
||||
@@ -25,6 +25,8 @@
|
||||
/*!
|
||||
* @file test_kernels.cuh
|
||||
* A collection of simple kernels for testing purposes.
|
||||
*
|
||||
* Note that these kernels are written to be short and simple, not performant.
|
||||
*/
|
||||
|
||||
namespace nvbench
|
||||
@@ -62,4 +64,19 @@ __global__ void copy_kernel(const T* in, U* out, std::size_t n)
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
* For `i <- [0,n)`, `out[i] = in[i] % 2`.
|
||||
*/
|
||||
template <typename T, typename U>
|
||||
__global__ void mod2_kernel(const T* in, U* out, std::size_t n)
|
||||
{
|
||||
const auto init = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = blockDim.x * gridDim.x;
|
||||
|
||||
for (auto i = init; i < n; i += step)
|
||||
{
|
||||
out[i] = static_cast<U>(in[i] % 2);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user