Formatting updates.

This commit is contained in:
Allison Piper
2025-04-14 17:26:12 +00:00
parent de36f1a248
commit 3440855dbd
107 changed files with 808 additions and 967 deletions

View File

@@ -24,37 +24,33 @@
template <int ItemsPerThread>
__global__ void kernel(std::size_t stride,
std::size_t elements,
const nvbench::int32_t * __restrict__ in,
const nvbench::int32_t *__restrict__ in,
nvbench::int32_t *__restrict__ out)
{
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const std::size_t step = gridDim.x * blockDim.x;
for (std::size_t i = stride * tid;
i < stride * elements;
i += stride * step)
for (std::size_t i = stride * tid; i < stride * elements; i += stride * step)
{
for (int j = 0; j < ItemsPerThread; j++)
{
const auto read_id = (ItemsPerThread * i + j) % elements;
const auto read_id = (ItemsPerThread * i + j) % elements;
const auto write_id = tid + j * elements;
out[write_id] = in[read_id];
out[write_id] = in[read_id];
}
}
}
// `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
// and cache hit rates.
//
// Calling state.collect_*() enables particular metric collection if nvbench
// was built with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
template <int ItemsPerThread>
void throughput_bench(nvbench::state &state,
nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
void throughput_bench(nvbench::state &state, nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
{
// Allocate input data:
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
thrust::device_vector<nvbench::int32_t> input(elements);
thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
@@ -72,12 +68,11 @@ void throughput_bench(nvbench::state &state,
static_cast<int>((elements + threads_in_block - 1) / threads_in_block);
state.exec([&](nvbench::launch &launch) {
kernel<ItemsPerThread>
<<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
stride,
elements,
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()));
kernel<ItemsPerThread><<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
stride,
elements,
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()));
});
}

View File

@@ -71,18 +71,16 @@ void copy_sweep_grid_shape(nvbench::state &state)
thrust::device_vector<nvbench::int32_t> in(num_values, 0);
thrust::device_vector<nvbench::int32_t> out(num_values, 0);
state.exec(
[block_size,
num_blocks,
num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void) num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
in_ptr,
out_ptr,
num_values);
});
state.exec([block_size,
num_blocks,
num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
}
NVBENCH_BENCH(copy_sweep_grid_shape)
// Every second power of two from 64->1024:
@@ -107,15 +105,12 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
thrust::device_vector<ValueType> in(num_values, 0);
thrust::device_vector<ValueType> out(num_values, 0);
state.exec(
[num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void) num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
state.exec([num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
});
}
// Define a type_list to use for the type axis:
using cts_types = nvbench::type_list<nvbench::uint8_t,
@@ -131,11 +126,10 @@ NVBENCH_BENCH_TYPES(copy_type_sweep, NVBENCH_TYPE_AXES(cts_types));
// Convert 64 MiB of InputTypes to OutputTypes, represented with various
// value_types.
template <typename InputType, typename OutputType>
void copy_type_conversion_sweep(nvbench::state &state,
nvbench::type_list<InputType, OutputType>)
void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// Optional: Skip narrowing conversions.
if constexpr(sizeof(InputType) > sizeof(OutputType))
if constexpr (sizeof(InputType) > sizeof(OutputType))
{
state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType).");
return;
@@ -154,15 +148,12 @@ void copy_type_conversion_sweep(nvbench::state &state,
thrust::device_vector<InputType> in(num_values, 0);
thrust::device_vector<OutputType> out(num_values, 0);
state.exec(
[num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void) num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
state.exec([num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
});
}
// Optional: Skip when InputType == OutputType. This approach avoids
// instantiating the benchmark at all.
@@ -178,6 +169,5 @@ using ctcs_types = nvbench::type_list<nvbench::int8_t,
nvbench::float32_t,
nvbench::int64_t,
nvbench::float64_t>;
NVBENCH_BENCH_TYPES(copy_type_conversion_sweep,
NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
NVBENCH_BENCH_TYPES(copy_type_conversion_sweep, NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
.set_type_axes_names({"In", "Out"});

View File

@@ -36,10 +36,7 @@ public:
protected:
// Set up the criterion in the `do_initialize()` method:
virtual void do_initialize() override
{
m_num_samples = 0;
}
virtual void do_initialize() override { m_num_samples = 0; }
// Process new measurements in the `do_add_measurement()` method:
virtual void do_add_measurement(nvbench::float64_t /* measurement */) override
@@ -52,7 +49,6 @@ protected:
{
return m_num_samples >= m_params.get_int64("max-samples");
}
};
// Register the criterion with NVBench:
@@ -71,7 +67,7 @@ void throughput_bench(nvbench::state &state)
state.add_global_memory_writes<nvbench::int32_t>(num_values);
state.exec(nvbench::exec_tag::no_batch, [&input, &output, num_values](nvbench::launch &launch) {
(void) num_values; // clang thinks this is unused...
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),

View File

@@ -17,7 +17,6 @@
*/
#include <nvbench/nvbench.cuh>
#include <nvbench/test_kernels.cuh>
// Enum to use as parameter axis:
@@ -68,12 +67,10 @@ void runtime_enum_sweep_string(nvbench::state &state)
// Create inputs, etc, configure runtime kernel parameters, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
NVBENCH_BENCH(runtime_enum_sweep_string)
.add_string_axis("MyEnum", {"A", "B", "C"});
NVBENCH_BENCH(runtime_enum_sweep_string).add_string_axis("MyEnum", {"A", "B", "C"});
//==============================================================================
// Sweep through enum values at runtime using an int64 axis.
@@ -97,9 +94,8 @@ void runtime_enum_sweep_int64(nvbench::state &state)
// Create inputs, etc, configure runtime kernel parameters, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
NVBENCH_BENCH(runtime_enum_sweep_int64)
.add_int64_axis("MyEnum",
@@ -178,12 +174,10 @@ void compile_time_enum_sweep(nvbench::state &state,
// Template parameters, static dispatch, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
using MyEnumList =
nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
using MyEnumList = nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
.set_type_axes_names({"MyEnum"});
@@ -199,16 +193,14 @@ NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
// * `-12` (struct std::integral_constant<int,-12>)
// ```
template <nvbench::int32_t IntValue>
void compile_time_int_sweep(nvbench::state &state,
nvbench::type_list<nvbench::enum_type<IntValue>>)
void compile_time_int_sweep(nvbench::state &state, nvbench::type_list<nvbench::enum_type<IntValue>>)
{
// Use IntValue in compile time contexts.
// Template parameters, static dispatch, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
using MyInts = nvbench::enum_type_list<0, 16, 4096, -12>;
NVBENCH_BENCH_TYPES(compile_time_int_sweep, NVBENCH_TYPE_AXES(MyInts))

View File

@@ -53,9 +53,7 @@ void sequence_bench(nvbench::state &state)
// nvbench::exec_tag::sync indicates that this will implicitly sync:
state.exec(nvbench::exec_tag::sync, [&data](nvbench::launch &launch) {
thrust::sequence(thrust::device.on(launch.get_stream()),
data.begin(),
data.end());
thrust::sequence(thrust::device.on(launch.get_stream()), data.begin(), data.end());
});
}
NVBENCH_BENCH(sequence_bench);

View File

@@ -23,8 +23,8 @@
// Thrust simplifies memory management, etc:
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>
// mod2_inplace performs an in-place mod2 over every element in `data`. `data`
@@ -54,7 +54,7 @@ void mod2_inplace(nvbench::state &state)
state.exec(nvbench::exec_tag::timer,
// Lambda now takes a `timer` argument:
[&input, &data, num_values](nvbench::launch &launch, auto &timer) {
(void) num_values; // clang thinks this is unused...
(void)num_values; // clang thinks this is unused...
// Reset working data:
thrust::copy(thrust::device.on(launch.get_stream()),

View File

@@ -72,14 +72,12 @@ NVBENCH_BENCH(runtime_skip)
// Two type axes are swept, but configurations where InputType == OutputType are
// skipped.
template <typename InputType, typename OutputType>
void skip_overload(nvbench::state &state,
nvbench::type_list<InputType, OutputType>)
void skip_overload(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// This is a contrived example that focuses on the skip overloads, so this is
// just a sleep kernel:
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
// Overload of skip_overload that is called when InputType == OutputType.
template <typename T>
@@ -107,9 +105,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// This is a contrived example that focuses on the skip overloads, so this is
// just a sleep kernel:
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
// Enable this overload if InputType is larger than OutputType
template <typename InputType, typename OutputType>
@@ -119,10 +116,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
state.skip("sizeof(InputType) > sizeof(OutputType).");
}
// The same type_list is used for both inputs/outputs.
using sn_types = nvbench::type_list<nvbench::int8_t,
nvbench::int16_t,
nvbench::int32_t,
nvbench::int64_t>;
using sn_types =
nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
// Set up benchmark:
NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types))
.set_type_axes_names({"In", "Out"});

View File

@@ -52,7 +52,7 @@ void stream_bench(nvbench::state &state)
state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
state.exec([&input, &output, num_values](nvbench::launch &) {
(void) num_values; // clang thinks this is unused...
(void)num_values; // clang thinks this is unused...
copy(thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),
num_values);

View File

@@ -26,8 +26,8 @@
void summary_example(nvbench::state &state)
{
// Fetch parameters and compute duration in seconds:
const auto ms = static_cast<nvbench::float64_t>(state.get_int64("ms"));
const auto us = static_cast<nvbench::float64_t>(state.get_int64("us"));
const auto ms = static_cast<nvbench::float64_t>(state.get_int64("ms"));
const auto us = static_cast<nvbench::float64_t>(state.get_int64("us"));
const auto duration = ms * 1e-3 + us * 1e-6;
// Add a new column to the summary table with the derived duration used by the benchmark.

View File

@@ -51,7 +51,7 @@ void throughput_bench(nvbench::state &state)
state.add_global_memory_writes<nvbench::int32_t>(num_values);
state.exec([&input, &output, num_values](nvbench::launch &launch) {
(void) num_values; // clang thinks this is unused...
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),