/* * Copyright 2021 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 with the LLVM exception * (the "License"); you may not use this file except in compliance with * the License. * * You may obtain a copy of the License at * * http://llvm.org/foundation/relicensing/LICENSE.txt * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include // Grab some testing kernels from NVBench: #include // Thrust vectors simplify memory management: #include //============================================================================== // Simple benchmark with no parameter axes: void simple(nvbench::state &state) { state.exec([](nvbench::launch &launch) { // Sleep for 1 millisecond: nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); }); } NVBENCH_BENCH(simple); //============================================================================== // Single parameter sweep: void single_float64_axis(nvbench::state &state) { const auto duration = state.get_float64("Duration"); state.exec([duration](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration); }); } NVBENCH_BENCH(single_float64_axis) // 0 -> 1 ms in 100 us increments. .add_float64_axis("Duration", nvbench::range(0., 1e-3, 1e-4)); //============================================================================== // Multiple parameters: // Varies block_size and num_blocks while invoking a naive copy of 256 MiB worth // of int32_t. void copy_sweep_grid_shape(nvbench::state &state) { // Get current parameters: const auto block_size = static_cast(state.get_int64("BlockSize")); const auto num_blocks = static_cast(state.get_int64("NumBlocks")); // Number of int32s in 256 MiB: const std::size_t num_values = 256 * 1024 * 1024 / sizeof(nvbench::int32_t); // Report throughput stats: state.add_element_count(num_values); state.add_global_memory_reads(num_values); state.add_global_memory_writes(num_values); // Allocate device memory: thrust::device_vector in(num_values, 1); thrust::device_vector out(num_values, 0); state.exec([block_size, num_blocks, num_values, in_ptr = thrust::raw_pointer_cast(in.data()), out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) { (void)num_values; // clang thinks this is unused... nvbench::copy_kernel<<>>(in_ptr, out_ptr, num_values); }); } NVBENCH_BENCH(copy_sweep_grid_shape) // Every second power of two from 64->1024: .add_int64_power_of_two_axis("BlockSize", nvbench::range(6, 10, 2)) .add_int64_power_of_two_axis("NumBlocks", nvbench::range(6, 10, 2)); //============================================================================== // Type parameter sweep: // Copy 256 MiB of data, represented with various value_types. template void copy_type_sweep(nvbench::state &state, nvbench::type_list) { // Number of ValueTypes in 256 MiB: const std::size_t num_values = 256 * 1024 * 1024 / sizeof(ValueType); // Report throughput stats: state.add_element_count(num_values); state.add_global_memory_reads(num_values); state.add_global_memory_writes(num_values); // Allocate device memory: thrust::device_vector in(num_values, ValueType{17}); thrust::device_vector out(num_values, 0); state.exec([num_values, in_ptr = thrust::raw_pointer_cast(in.data()), out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) { (void)num_values; // clang thinks this is unused... nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values); }); } // Define a type_list to use for the type axis: using cts_types = nvbench::type_list; NVBENCH_BENCH_TYPES(copy_type_sweep, NVBENCH_TYPE_AXES(cts_types)); //============================================================================== // Type parameter sweep: // Convert 64 MiB of InputTypes to OutputTypes, represented with various // value_types. template void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list) { // Optional: Skip narrowing conversions. if constexpr (sizeof(InputType) > sizeof(OutputType)) { state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."); return; } // Number of InputTypes in 64 MiB: const std::size_t num_values = 64 * 1024 * 1024 / sizeof(InputType); // Report throughput stats: Passing an optional string adds a column to the // output with the number of items/bytes. state.add_element_count(num_values, "Items"); state.add_global_memory_reads(num_values, "InSize"); state.add_global_memory_writes(num_values, "OutSize"); // Allocate device memory: thrust::device_vector in(num_values, 0); thrust::device_vector out(num_values, 0); state.exec([num_values, in_ptr = thrust::raw_pointer_cast(in.data()), out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) { (void)num_values; // clang thinks this is unused... nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values); }); } // Optional: Skip when InputType == OutputType. This approach avoids // instantiating the benchmark at all. template void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list) { state.skip("Not a conversion: InputType == OutputType."); } // The same type_list is used for both inputs/outputs. using ctcs_types = nvbench::type_list; NVBENCH_BENCH_TYPES(copy_type_conversion_sweep, NVBENCH_TYPE_AXES(ctcs_types, ctcs_types)) .set_type_axes_names({"In", "Out"}); // ================================================================================== // Passing list of typenames and `enum_type_list` to build cartesian product // of typenames and integral constants // define constant wrapper helper type template using cw_t = std::integral_constant; template void copy_type_and_block_size_sweep(nvbench::state &state, nvbench::type_list>) { const std::size_t nelems = 256 * 1024 * 1024 / sizeof(ValueT); ValueT fill_value{42}; thrust::device_vector inp(nelems, fill_value); thrust::device_vector out(nelems, ValueT{}); // use cuda::ceil_div(nelems, BLOCK_DIM) with CCCL 2.8 and newer const auto gridSize = (nelems + BLOCK_DIM - 1) / BLOCK_DIM; const ValueT *inp_p = thrust::raw_pointer_cast(inp.data()); ValueT *out_p = thrust::raw_pointer_cast(out.data()); state.add_element_count(nelems, "ElementCount"); state.add_global_memory_reads(nelems, "Input"); state.add_global_memory_writes(nelems, "Output"); state.exec([&](nvbench::launch &launch) { nvbench::copy_kernel<<>>(inp_p, out_p, nelems); }); } template using cw_list = nvbench::type_list...>; using block_sizes = cw_list<64u, 128u, 196u, 256u, 320u, 512u>; NVBENCH_BENCH_TYPES(copy_type_and_block_size_sweep, NVBENCH_TYPE_AXES(ctcs_types, block_sizes)) .set_type_axes_names({"Type", "BlockSize"});