From e7cc1e344cc45afa4f98c402425a83ccdb15050c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 7 Oct 2025 12:49:17 -0500 Subject: [PATCH] Add a benchmark example parametrized by typename and integral constant. (#275) * Add a benchmark example parametrized by typename and integral constant. Add a variation of copy_type_sweep kernel, where block size is controlled via integral constant passed as template parameter. * Addressed PR review feedback * Use auto for gridSize * Address PR review change request * Add comment to use ceil_div with CCCL >= 2.8 --- examples/axes.cu | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/examples/axes.cu b/examples/axes.cu index 59139b5..17ddaa9 100644 --- a/examples/axes.cu +++ b/examples/axes.cu @@ -68,7 +68,7 @@ void copy_sweep_grid_shape(nvbench::state &state) state.add_global_memory_writes(num_values); // Allocate device memory: - thrust::device_vector in(num_values, 0); + thrust::device_vector in(num_values, 1); thrust::device_vector out(num_values, 0); state.exec([block_size, @@ -102,7 +102,7 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(num_values); // Allocate device memory: - thrust::device_vector in(num_values, 0); + thrust::device_vector in(num_values, ValueType{17}); thrust::device_vector out(num_values, 0); state.exec([num_values, @@ -171,3 +171,42 @@ using ctcs_types = nvbench::type_list; NVBENCH_BENCH_TYPES(copy_type_conversion_sweep, NVBENCH_TYPE_AXES(ctcs_types, ctcs_types)) .set_type_axes_names({"In", "Out"}); + +// ================================================================================== +// Passing list of typenames and `enum_type_list` to build cartesian product +// of typenames and integral constants + +// define constant wrapper helper type +template +using cw_t = std::integral_constant; + +template +void 
copy_type_and_block_size_sweep(nvbench::state &state, + nvbench::type_list>) +{ + const std::size_t nelems = 256 * 1024 * 1024 / sizeof(ValueT); + ValueT fill_value{42}; + thrust::device_vector inp(nelems, fill_value); + thrust::device_vector out(nelems, ValueT{}); + + // use cuda::ceil_div(nelems, BLOCK_DIM) with CCCL 2.8 and newer + const auto gridSize = (nelems + BLOCK_DIM - 1) / BLOCK_DIM; + + const ValueT *inp_p = thrust::raw_pointer_cast(inp.data()); + ValueT *out_p = thrust::raw_pointer_cast(out.data()); + + state.add_element_count(nelems, "ElementCount"); + state.add_global_memory_reads(nelems, "Input"); + state.add_global_memory_writes(nelems, "Output"); + + state.exec([&](nvbench::launch &launch) { + nvbench::copy_kernel<<>>(inp_p, out_p, nelems); + }); +} + +template +using cw_list = nvbench::type_list...>; + +using block_sizes = cw_list<64u, 128u, 196u, 256u, 320u, 512u>; +NVBENCH_BENCH_TYPES(copy_type_and_block_size_sweep, NVBENCH_TYPE_AXES(ctcs_types, block_sizes)) + .set_type_axes_names({"Type", "BlockSize"});