Add more examples.

- exec_tag_timer
- exec_tag_sync
- skip
- throughput
This commit is contained in:
Allison Vacanti
2021-03-09 16:02:34 -05:00
parent 3d83fe20ac
commit 3fc75f5ea6
9 changed files with 369 additions and 10 deletions

View File

@@ -1,5 +1,9 @@
# Example translation units; the foreach() loop below builds each .cu file
# into its own example executable target.
set(example_srcs
axes.cu
exec_tag_sync.cu
exec_tag_timer.cu
skip.cu
throughput.cu
)
foreach(example_src IN LISTS example_srcs)

View File

@@ -24,8 +24,6 @@
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
#include <type_traits>
//==============================================================================
// Simple benchmark with no parameter axes:
void simple(nvbench::state &state)

58
examples/exec_tag_sync.cu Normal file
View File

@@ -0,0 +1,58 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// Used to initialize input data:
#include <thrust/sequence.h>
// `sequence_bench`: times `thrust::sequence`. Thrust algorithms implicitly
// synchronize the CUDA device, so the lambda passed to `state.exec(...)` is
// tagged with `nvbench::exec_tag::sync`.
//
// NVBench's default measurement path improves GPU timing stability, but it
// deadlocks if the measured lambda blocks. The `sync` tag switches to a
// deadlock-safe path; it also disables batch measurements, which a blocking
// lambda would skew anyway.
void sequence_bench(nvbench::state &state)
{
  // 64 MiB worth of int32 output values:
  constexpr std::size_t buffer_bytes = 64 * 1024 * 1024;
  const std::size_t num_values       = buffer_bytes / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> data(num_values);

  // Throughput reporting:
  state.add_element_count(num_values);
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  // The `sync` tag tells NVBench this lambda blocks on the device:
  state.exec(nvbench::exec_tag::sync, [&data](nvbench::launch &launch) {
    thrust::sequence(thrust::device.on(launch.get_stream()),
                     data.begin(),
                     data.end());
  });
}
NVBENCH_BENCH(sequence_bench);

View File

@@ -0,0 +1,73 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// Used to initialize input data:
#include <thrust/sequence.h>
// mod2_inplace performs an in-place mod2 over every element in `data`. `data`
// is reset to `input` each iteration. A manual timer is requested by passing
// `nvbench::exec_tag::timer` to `state.exec(...)`, which is used to only time
// the mod2, and not the reset.
//
// Note that this disables the batch timings, since the reset phase will throw
// off the batch results.
void mod2_inplace(nvbench::state &state)
{
  // Allocate input data:
  const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> input(num_values);
  thrust::sequence(input.begin(), input.end());

  // Working data buffer:
  thrust::device_vector<nvbench::int32_t> data(num_values);

  // Provide throughput information:
  state.add_element_count(num_values);
  state.add_global_memory_reads<nvbench::int32_t>(num_values);
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  // Request timer with `nvbench::exec_tag::timer`:
  state.exec(nvbench::exec_tag::timer,
             // Lambda now takes a `timer` argument:
             [&input, &data, num_values](nvbench::launch &launch, auto &timer) {
               // Reset working data:
               data = input;

               // Start timer:
               timer.start();

               // Run kernel of interest. Fix: the mod2 must run in-place on
               // `data` (the freshly-reset working buffer) — the previous
               // version passed `input` for both pointers, which corrupted
               // the pristine reference copy and left `data` unused, making
               // the timed reset meaningless.
               nvbench::mod2_kernel<<<256, 256, 0, launch.get_stream()>>>(
                 thrust::raw_pointer_cast(data.data()),
                 thrust::raw_pointer_cast(data.data()),
                 num_values);

               // Stop timer:
               timer.stop();
             });
}
NVBENCH_BENCH(mod2_inplace);

128
examples/skip.cu Normal file
View File

@@ -0,0 +1,128 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// std::enable_if_t
#include <type_traits>
//==============================================================================
// `runtime_skip`: skip benchmark configurations dynamically.
//
// A float64 axis and a string axis are swept (see axes.cu). Unwanted
// combinations call `state.skip(reason)`; the reason string appears in the
// log and is captured in JSON output.
void runtime_skip(nvbench::state &state)
{
  const auto dur    = state.get_float64("Duration");
  const auto flavor = state.get_string("Kramble");

  // Baz configurations under 0.8 ms are not interesting:
  if (flavor == "Baz" && dur < 0.8e-3)
  {
    state.skip("Short 'Baz' benchmarks are skipped.");
    return;
  }

  // Foo configurations over 0.3 ms are not interesting:
  if (flavor == "Foo" && dur > 0.3e-3)
  {
    state.skip("Long 'Foo' benchmarks are skipped.");
    return;
  }

  // Everything else executes normally:
  state.exec([dur](nvbench::launch &launch) {
    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(dur);
  });
}
NVBENCH_BENCH(runtime_skip)
  // 0, 0.25, 0.5, 0.75, and 1.0 milliseconds:
  .add_float64_axis("Duration",
                    nvbench::range(0.,
                                   1.1e-3, // .1e-3 slop for fp precision
                                   0.25e-3))
  .add_string_axis("Kramble", {"Foo", "Bar", "Baz"});
//==============================================================================
// `skip_overload`: skip configurations at compile time by supplying a
// dedicated overload for the unwanted case.
//
// Two type axes are swept; whenever both produce the same type, overload
// resolution picks the second function below and the configuration is skipped.
template <typename InT, typename OutT>
void skip_overload(nvbench::state &state, nvbench::type_list<InT, OutT>)
{
  // Contrived payload — this example is about the overload set, so the body
  // is just a sleep kernel:
  state.exec([](nvbench::launch &launch) {
    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
  });
}

// Selected when both type axes yield the same type:
template <typename T>
void skip_overload(nvbench::state &state, nvbench::type_list<T, T>)
{
  state.skip("InputType == OutputType.");
}

// A single type_list serves as both the input and the output axis:
using sst_types = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;

// Register the benchmark:
NVBENCH_BENCH_TYPES(skip_overload, NVBENCH_TYPE_AXES(sst_types, sst_types))
  .set_type_axes_names({"In", "Out"});
//==============================================================================
// `skip_sfinae`: skip configurations at compile time using SFINAE, for
// conditions that plain overloading cannot express.
//
// Two type axes are swept; configurations where the input type is wider than
// the output type are skipped, all others run.

// Participates in overload resolution only when sizeof(InT) <= sizeof(OutT):
template <typename InT, typename OutT>
std::enable_if_t<(sizeof(InT) <= sizeof(OutT)), void>
skip_sfinae(nvbench::state &state, nvbench::type_list<InT, OutT>)
{
  // Contrived payload — this example is about the SFINAE dispatch, so the
  // body is just a sleep kernel:
  state.exec([](nvbench::launch &launch) {
    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
  });
}

// Participates in overload resolution only when sizeof(InT) > sizeof(OutT):
template <typename InT, typename OutT>
std::enable_if_t<(sizeof(InT) > sizeof(OutT)), void>
skip_sfinae(nvbench::state &state, nvbench::type_list<InT, OutT>)
{
  state.skip("sizeof(InputType) > sizeof(OutputType).");
}

// A single type_list serves as both the input and the output axis:
using sn_types = nvbench::type_list<nvbench::int8_t,
                                    nvbench::int16_t,
                                    nvbench::int32_t,
                                    nvbench::int64_t>;

// Register the benchmark:
NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types))
  .set_type_axes_names({"In", "Out"});

60
examples/throughput.cu Normal file
View File

@@ -0,0 +1,60 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// `throughput_bench`: copies a 64 MiB buffer of int32_t and reports the
// throughput several ways.
//
// - `state.add_element_count(n)` reports items/second.
// - `state.add_global_memory_reads<T>(n)` / `add_global_memory_writes<T>(n)`
//   report bytes/second and the percentage of the current device's peak
//   global-memory bandwidth.
//
// Each method also accepts an optional `column_name`, which adds the reported
// element count / buffer size as an extra output column under that name.
void throughput_bench(nvbench::state &state)
{
  // 64 MiB worth of int32 values:
  constexpr std::size_t buffer_bytes = 64 * 1024 * 1024;
  const std::size_t num_values       = buffer_bytes / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> input(num_values);
  thrust::device_vector<nvbench::int32_t> output(num_values);

  // Throughput reporting (two with named output columns):
  state.add_element_count(num_values, "NumElements");
  state.add_global_memory_reads<nvbench::int32_t>(num_values, "DataSize");
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  state.exec([&input, &output, num_values](nvbench::launch &launch) {
    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
      thrust::raw_pointer_cast(input.data()),
      thrust::raw_pointer_cast(output.data()),
      num_values);
  });
}
NVBENCH_BENCH(throughput_bench);