mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-04-20 14:58:54 +00:00
Formatting updates.
This commit is contained in:
@@ -24,37 +24,33 @@
|
||||
template <int ItemsPerThread>
|
||||
__global__ void kernel(std::size_t stride,
|
||||
std::size_t elements,
|
||||
const nvbench::int32_t * __restrict__ in,
|
||||
const nvbench::int32_t *__restrict__ in,
|
||||
nvbench::int32_t *__restrict__ out)
|
||||
{
|
||||
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const std::size_t step = gridDim.x * blockDim.x;
|
||||
|
||||
for (std::size_t i = stride * tid;
|
||||
i < stride * elements;
|
||||
i += stride * step)
|
||||
for (std::size_t i = stride * tid; i < stride * elements; i += stride * step)
|
||||
{
|
||||
for (int j = 0; j < ItemsPerThread; j++)
|
||||
{
|
||||
const auto read_id = (ItemsPerThread * i + j) % elements;
|
||||
const auto read_id = (ItemsPerThread * i + j) % elements;
|
||||
const auto write_id = tid + j * elements;
|
||||
out[write_id] = in[read_id];
|
||||
out[write_id] = in[read_id];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
|
||||
// and cache hit rates.
|
||||
//
|
||||
// Calling state.collect_*() enables particular metric collection if nvbench
|
||||
// was built with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
|
||||
template <int ItemsPerThread>
|
||||
void throughput_bench(nvbench::state &state,
|
||||
nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
|
||||
void throughput_bench(nvbench::state &state, nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
|
||||
{
|
||||
// Allocate input data:
|
||||
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
|
||||
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
|
||||
const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
|
||||
thrust::device_vector<nvbench::int32_t> input(elements);
|
||||
thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
|
||||
@@ -72,12 +68,11 @@ void throughput_bench(nvbench::state &state,
|
||||
static_cast<int>((elements + threads_in_block - 1) / threads_in_block);
|
||||
|
||||
state.exec([&](nvbench::launch &launch) {
|
||||
kernel<ItemsPerThread>
|
||||
<<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
|
||||
stride,
|
||||
elements,
|
||||
thrust::raw_pointer_cast(input.data()),
|
||||
thrust::raw_pointer_cast(output.data()));
|
||||
kernel<ItemsPerThread><<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
|
||||
stride,
|
||||
elements,
|
||||
thrust::raw_pointer_cast(input.data()),
|
||||
thrust::raw_pointer_cast(output.data()));
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -71,18 +71,16 @@ void copy_sweep_grid_shape(nvbench::state &state)
|
||||
thrust::device_vector<nvbench::int32_t> in(num_values, 0);
|
||||
thrust::device_vector<nvbench::int32_t> out(num_values, 0);
|
||||
|
||||
state.exec(
|
||||
[block_size,
|
||||
num_blocks,
|
||||
num_values,
|
||||
in_ptr = thrust::raw_pointer_cast(in.data()),
|
||||
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
|
||||
(void) num_values; // clang thinks this is unused...
|
||||
nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
|
||||
in_ptr,
|
||||
out_ptr,
|
||||
num_values);
|
||||
});
|
||||
state.exec([block_size,
|
||||
num_blocks,
|
||||
num_values,
|
||||
in_ptr = thrust::raw_pointer_cast(in.data()),
|
||||
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
|
||||
(void)num_values; // clang thinks this is unused...
|
||||
nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(in_ptr,
|
||||
out_ptr,
|
||||
num_values);
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(copy_sweep_grid_shape)
|
||||
// Every second power of two from 64->1024:
|
||||
@@ -107,15 +105,12 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
|
||||
thrust::device_vector<ValueType> in(num_values, 0);
|
||||
thrust::device_vector<ValueType> out(num_values, 0);
|
||||
|
||||
state.exec(
|
||||
[num_values,
|
||||
in_ptr = thrust::raw_pointer_cast(in.data()),
|
||||
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
|
||||
(void) num_values; // clang thinks this is unused...
|
||||
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
|
||||
out_ptr,
|
||||
num_values);
|
||||
});
|
||||
state.exec([num_values,
|
||||
in_ptr = thrust::raw_pointer_cast(in.data()),
|
||||
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
|
||||
(void)num_values; // clang thinks this is unused...
|
||||
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
|
||||
});
|
||||
}
|
||||
// Define a type_list to use for the type axis:
|
||||
using cts_types = nvbench::type_list<nvbench::uint8_t,
|
||||
@@ -131,11 +126,10 @@ NVBENCH_BENCH_TYPES(copy_type_sweep, NVBENCH_TYPE_AXES(cts_types));
|
||||
// Convert 64 MiB of InputTypes to OutputTypes, represented with various
|
||||
// value_types.
|
||||
template <typename InputType, typename OutputType>
|
||||
void copy_type_conversion_sweep(nvbench::state &state,
|
||||
nvbench::type_list<InputType, OutputType>)
|
||||
void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
|
||||
{
|
||||
// Optional: Skip narrowing conversions.
|
||||
if constexpr(sizeof(InputType) > sizeof(OutputType))
|
||||
if constexpr (sizeof(InputType) > sizeof(OutputType))
|
||||
{
|
||||
state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType).");
|
||||
return;
|
||||
@@ -154,15 +148,12 @@ void copy_type_conversion_sweep(nvbench::state &state,
|
||||
thrust::device_vector<InputType> in(num_values, 0);
|
||||
thrust::device_vector<OutputType> out(num_values, 0);
|
||||
|
||||
state.exec(
|
||||
[num_values,
|
||||
in_ptr = thrust::raw_pointer_cast(in.data()),
|
||||
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
|
||||
(void) num_values; // clang thinks this is unused...
|
||||
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
|
||||
out_ptr,
|
||||
num_values);
|
||||
});
|
||||
state.exec([num_values,
|
||||
in_ptr = thrust::raw_pointer_cast(in.data()),
|
||||
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
|
||||
(void)num_values; // clang thinks this is unused...
|
||||
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
|
||||
});
|
||||
}
|
||||
// Optional: Skip when InputType == OutputType. This approach avoids
|
||||
// instantiating the benchmark at all.
|
||||
@@ -178,6 +169,5 @@ using ctcs_types = nvbench::type_list<nvbench::int8_t,
|
||||
nvbench::float32_t,
|
||||
nvbench::int64_t,
|
||||
nvbench::float64_t>;
|
||||
NVBENCH_BENCH_TYPES(copy_type_conversion_sweep,
|
||||
NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
|
||||
NVBENCH_BENCH_TYPES(copy_type_conversion_sweep, NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
|
||||
.set_type_axes_names({"In", "Out"});
|
||||
|
||||
@@ -36,10 +36,7 @@ public:
|
||||
|
||||
protected:
|
||||
// Setup the criterion in the `do_initialize()` method:
|
||||
virtual void do_initialize() override
|
||||
{
|
||||
m_num_samples = 0;
|
||||
}
|
||||
virtual void do_initialize() override { m_num_samples = 0; }
|
||||
|
||||
// Process new measurements in the `do_add_measurement()` method:
|
||||
virtual void do_add_measurement(nvbench::float64_t /* measurement */) override
|
||||
@@ -52,7 +49,6 @@ protected:
|
||||
{
|
||||
return m_num_samples >= m_params.get_int64("max-samples");
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// Register the criterion with NVBench:
|
||||
@@ -71,7 +67,7 @@ void throughput_bench(nvbench::state &state)
|
||||
state.add_global_memory_writes<nvbench::int32_t>(num_values);
|
||||
|
||||
state.exec(nvbench::exec_tag::no_batch, [&input, &output, num_values](nvbench::launch &launch) {
|
||||
(void) num_values; // clang thinks this is unused...
|
||||
(void)num_values; // clang thinks this is unused...
|
||||
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
|
||||
thrust::raw_pointer_cast(input.data()),
|
||||
thrust::raw_pointer_cast(output.data()),
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
*/
|
||||
|
||||
#include <nvbench/nvbench.cuh>
|
||||
|
||||
#include <nvbench/test_kernels.cuh>
|
||||
|
||||
// Enum to use as parameter axis:
|
||||
@@ -68,12 +67,10 @@ void runtime_enum_sweep_string(nvbench::state &state)
|
||||
// Create inputs, etc, configure runtime kernel parameters, etc.
|
||||
|
||||
// Just a dummy kernel.
|
||||
state.exec([](nvbench::launch &launch) {
|
||||
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
|
||||
});
|
||||
state.exec(
|
||||
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
|
||||
}
|
||||
NVBENCH_BENCH(runtime_enum_sweep_string)
|
||||
.add_string_axis("MyEnum", {"A", "B", "C"});
|
||||
NVBENCH_BENCH(runtime_enum_sweep_string).add_string_axis("MyEnum", {"A", "B", "C"});
|
||||
|
||||
//==============================================================================
|
||||
// Sweep through enum values at runtime using an int64 axis.
|
||||
@@ -97,9 +94,8 @@ void runtime_enum_sweep_int64(nvbench::state &state)
|
||||
// Create inputs, etc, configure runtime kernel parameters, etc.
|
||||
|
||||
// Just a dummy kernel.
|
||||
state.exec([](nvbench::launch &launch) {
|
||||
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
|
||||
});
|
||||
state.exec(
|
||||
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
|
||||
}
|
||||
NVBENCH_BENCH(runtime_enum_sweep_int64)
|
||||
.add_int64_axis("MyEnum",
|
||||
@@ -178,12 +174,10 @@ void compile_time_enum_sweep(nvbench::state &state,
|
||||
// Template parameters, static dispatch, etc.
|
||||
|
||||
// Just a dummy kernel.
|
||||
state.exec([](nvbench::launch &launch) {
|
||||
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
|
||||
});
|
||||
state.exec(
|
||||
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
|
||||
}
|
||||
using MyEnumList =
|
||||
nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
|
||||
using MyEnumList = nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
|
||||
NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
|
||||
.set_type_axes_names({"MyEnum"});
|
||||
|
||||
@@ -199,16 +193,14 @@ NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
|
||||
// * `-12` (struct std::integral_constant<int,-12>)
|
||||
// ```
|
||||
template <nvbench::int32_t IntValue>
|
||||
void compile_time_int_sweep(nvbench::state &state,
|
||||
nvbench::type_list<nvbench::enum_type<IntValue>>)
|
||||
void compile_time_int_sweep(nvbench::state &state, nvbench::type_list<nvbench::enum_type<IntValue>>)
|
||||
{
|
||||
// Use IntValue in compile time contexts.
|
||||
// Template parameters, static dispatch, etc.
|
||||
|
||||
// Just a dummy kernel.
|
||||
state.exec([](nvbench::launch &launch) {
|
||||
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
|
||||
});
|
||||
state.exec(
|
||||
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
|
||||
}
|
||||
using MyInts = nvbench::enum_type_list<0, 16, 4096, -12>;
|
||||
NVBENCH_BENCH_TYPES(compile_time_int_sweep, NVBENCH_TYPE_AXES(MyInts))
|
||||
|
||||
@@ -53,9 +53,7 @@ void sequence_bench(nvbench::state &state)
|
||||
|
||||
// nvbench::exec_tag::sync indicates that this will implicitly sync:
|
||||
state.exec(nvbench::exec_tag::sync, [&data](nvbench::launch &launch) {
|
||||
thrust::sequence(thrust::device.on(launch.get_stream()),
|
||||
data.begin(),
|
||||
data.end());
|
||||
thrust::sequence(thrust::device.on(launch.get_stream()), data.begin(), data.end());
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(sequence_bench);
|
||||
|
||||
@@ -23,8 +23,8 @@
|
||||
|
||||
// Thrust simplifies memory management, etc:
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
#include <thrust/sequence.h>
|
||||
|
||||
// mod2_inplace performs an in-place mod2 over every element in `data`. `data`
|
||||
@@ -54,7 +54,7 @@ void mod2_inplace(nvbench::state &state)
|
||||
state.exec(nvbench::exec_tag::timer,
|
||||
// Lambda now takes a `timer` argument:
|
||||
[&input, &data, num_values](nvbench::launch &launch, auto &timer) {
|
||||
(void) num_values; // clang thinks this is unused...
|
||||
(void)num_values; // clang thinks this is unused...
|
||||
|
||||
// Reset working data:
|
||||
thrust::copy(thrust::device.on(launch.get_stream()),
|
||||
|
||||
@@ -72,14 +72,12 @@ NVBENCH_BENCH(runtime_skip)
|
||||
// Two type axes are swept, but configurations where InputType == OutputType are
|
||||
// skipped.
|
||||
template <typename InputType, typename OutputType>
|
||||
void skip_overload(nvbench::state &state,
|
||||
nvbench::type_list<InputType, OutputType>)
|
||||
void skip_overload(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
|
||||
{
|
||||
// This is a contrived example that focuses on the skip overloads, so this is
|
||||
// just a sleep kernel:
|
||||
state.exec([](nvbench::launch &launch) {
|
||||
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
|
||||
});
|
||||
state.exec(
|
||||
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
|
||||
}
|
||||
// Overload of skip_overload that is called when InputType == OutputType.
|
||||
template <typename T>
|
||||
@@ -107,9 +105,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
|
||||
{
|
||||
// This is a contrived example that focuses on the skip overloads, so this is
|
||||
// just a sleep kernel:
|
||||
state.exec([](nvbench::launch &launch) {
|
||||
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
|
||||
});
|
||||
state.exec(
|
||||
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
|
||||
}
|
||||
// Enable this overload if InputType is larger than OutputType
|
||||
template <typename InputType, typename OutputType>
|
||||
@@ -119,10 +116,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
|
||||
state.skip("sizeof(InputType) > sizeof(OutputType).");
|
||||
}
|
||||
// The same type_list is used for both inputs/outputs.
|
||||
using sn_types = nvbench::type_list<nvbench::int8_t,
|
||||
nvbench::int16_t,
|
||||
nvbench::int32_t,
|
||||
nvbench::int64_t>;
|
||||
using sn_types =
|
||||
nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
|
||||
// Setup benchmark:
|
||||
NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types))
|
||||
.set_type_axes_names({"In", "Out"});
|
||||
|
||||
@@ -52,7 +52,7 @@ void stream_bench(nvbench::state &state)
|
||||
state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
|
||||
|
||||
state.exec([&input, &output, num_values](nvbench::launch &) {
|
||||
(void) num_values; // clang thinks this is unused...
|
||||
(void)num_values; // clang thinks this is unused...
|
||||
copy(thrust::raw_pointer_cast(input.data()),
|
||||
thrust::raw_pointer_cast(output.data()),
|
||||
num_values);
|
||||
|
||||
@@ -26,8 +26,8 @@
|
||||
void summary_example(nvbench::state &state)
|
||||
{
|
||||
// Fetch parameters and compute duration in seconds:
|
||||
const auto ms = static_cast<nvbench::float64_t>(state.get_int64("ms"));
|
||||
const auto us = static_cast<nvbench::float64_t>(state.get_int64("us"));
|
||||
const auto ms = static_cast<nvbench::float64_t>(state.get_int64("ms"));
|
||||
const auto us = static_cast<nvbench::float64_t>(state.get_int64("us"));
|
||||
const auto duration = ms * 1e-3 + us * 1e-6;
|
||||
|
||||
// Add a new column to the summary table with the derived duration used by the benchmark.
|
||||
|
||||
@@ -51,7 +51,7 @@ void throughput_bench(nvbench::state &state)
|
||||
state.add_global_memory_writes<nvbench::int32_t>(num_values);
|
||||
|
||||
state.exec([&input, &output, num_values](nvbench::launch &launch) {
|
||||
(void) num_values; // clang thinks this is unused...
|
||||
(void)num_values; // clang thinks this is unused...
|
||||
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
|
||||
thrust::raw_pointer_cast(input.data()),
|
||||
thrust::raw_pointer_cast(output.data()),
|
||||
|
||||
Reference in New Issue
Block a user