Add the first example.

- New NVBench_ENABLE_EXAMPLES CMake option. - examples/axes.cu provides examples of parameter sweeps. - Moves testing/sleep_kernel.cuh -> nvbench/test_kernels.cuh - Accessible to examples and provides some built-in kernels for users to experiment with. - Not included with `<nvbench/nvbench.cuh>`.
2026-04-20 14:58:54 +00:00 · 2021-03-08 18:20:47 -05:00
parent b01b8fe8dc
commit c133784763
5 changed files with 234 additions and 3 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,7 @@ cmake_minimum_required(VERSION 3.18.3)
 project(NVBench CUDA CXX)

 option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
+option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)

 # Setup some vars for CPM packages:
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/")
@@ -44,7 +45,14 @@ endif()

 add_subdirectory(nvbench)

-if (NVBench_ENABLE_TESTING)
+if (NVBench_ENABLE_EXAMPLES OR NVBench_ENABLE_TESTING)
  enable_testing()
+endif()
+
+if (NVBench_ENABLE_EXAMPLES)
+  add_subdirectory(examples)
+endif()
+
+if (NVBench_ENABLE_TESTING)
  add_subdirectory(testing)
 endif()
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Every source listed here is compiled into its own example executable.
+set(example_srcs
+  axes.cu
+)
+
+foreach(example_src IN LISTS example_srcs)
+  # Target name: "nvbench.example.<file name without its last extension>".
+  get_filename_component(example_stem "${example_src}" NAME_WLE)
+  set(example_name "nvbench.example.${example_stem}")
+
+  add_executable(${example_name} "${example_src}")
+  target_link_libraries(${example_name} PRIVATE nvbench::main)
+  target_include_directories(${example_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}")
+
+  # Language standard and output locations, set in a single call.
+  set_target_properties(${example_name} PROPERTIES
+    COMPILE_FEATURES cuda_std_17
+    ARCHIVE_OUTPUT_DIRECTORY "${NVBench_LIBRARY_OUTPUT_DIR}"
+    LIBRARY_OUTPUT_DIRECTORY "${NVBench_LIBRARY_OUTPUT_DIR}"
+    RUNTIME_OUTPUT_DIRECTORY "${NVBench_EXECUTABLE_OUTPUT_DIR}"
+  )
+
+  # Register the example with CTest so it is run as a test.
+  add_test(NAME ${example_name} COMMAND "$<TARGET_FILE:${example_name}>")
+endforeach()
--- a/examples/axes.cu
+++ b/examples/axes.cu
@@ -0,0 +1,177 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+// Grab some testing kernels from NVBench:
+#include <nvbench/test_kernels.cuh>
+
+// Thrust vectors simplify memory management:
+#include <thrust/device_vector.h>
+
+#include <type_traits>
+
+//==============================================================================
+// Minimal benchmark: no parameter axes, one fixed workload.
+void simple(nvbench::state &state)
+{
+  state.exec([](nvbench::launch &launch) {
+    // A single thread in a single block sleeps for 1 millisecond, enqueued on
+    // the stream returned by launch.get_stream().
+    const double seconds = 1e-3;
+    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(seconds);
+  });
+}
+NVBENCH_BENCH(simple);
+
+//==============================================================================
+// Single parameter sweep:
+// The sleep time (in seconds) is read from the float64 axis named "Duration".
+void single_float64_axis(nvbench::state &state)
+{
+  // The axis value for this run is captured by value when the lambda is built.
+  state.exec(
+    [sleep_seconds = state.get_float64("Duration")](nvbench::launch &launch) {
+      nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(sleep_seconds);
+    });
+}
+NVBENCH_BENCH(single_float64_axis)
+  // Sweep "Duration" from 0 to 1 ms in steps of 100 us.
+  .add_float64_axis("Duration", nvbench::range(0., 1e-3, 1e-4));
+
+//==============================================================================
+// Multiple parameters:
+// Varies block_size and num_blocks while invoking a naive copy of 256 MiB worth
+// of int32_t.
+void copy_sweep_grid_shape(nvbench::state &state)
+{
+  // Get current parameters:
+  const int block_size = static_cast<int>(state.get_int64("BlockSize"));
+  const int num_blocks = static_cast<int>(state.get_int64("NumBlocks"));
+
+  // Number of int32's in 256 MiB:
+  const std::size_t num_values = 256 * 1024 * 1024 / sizeof(nvbench::int32_t);
+
+  // Report throughput stats:
+  state.add_element_count(num_values);
+  state.add_global_memory_reads<nvbench::int32_t>(num_values);
+  state.add_global_memory_writes<nvbench::int32_t>(num_values);
+
+  // Allocate device memory:
+  thrust::device_vector<nvbench::int32_t> in(num_values, 0);
+  thrust::device_vector<nvbench::int32_t> out(num_values, 0);
+
+  state.exec(
+    [block_size,
+     num_blocks,
+     num_values,
+     in_ptr  = thrust::raw_pointer_cast(in.data()),
+     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+      // Enqueue on the stream supplied by NVBench (as `simple` and
+      // `single_float64_axis` do) rather than on the default stream.
+      nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
+        in_ptr,
+        out_ptr,
+        num_values);
+    });
+}
+NVBENCH_BENCH(copy_sweep_grid_shape)
+  // Alternating powers of two between 64->1024 (2^6, 2^8, 2^10):
+  .add_int64_power_of_two_axis("BlockSize", nvbench::range(6, 10, 2))
+  .add_int64_power_of_two_axis("NumBlocks", nvbench::range(6, 10, 2));
+
+//==============================================================================
+// Type parameter sweep:
+// Copy 256 MiB of data, represented with various value_types.
+template <typename ValueType>
+void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
+{
+  // Number of ValueType's in 256 MiB:
+  const std::size_t num_values = 256 * 1024 * 1024 / sizeof(ValueType);
+
+  // Report throughput stats:
+  state.add_element_count(num_values);
+  state.add_global_memory_reads<ValueType>(num_values);
+  state.add_global_memory_writes<ValueType>(num_values);
+
+  // Allocate device memory:
+  thrust::device_vector<ValueType> in(num_values, 0);
+  thrust::device_vector<ValueType> out(num_values, 0);
+
+  state.exec(
+    [num_values,
+     in_ptr  = thrust::raw_pointer_cast(in.data()),
+     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+      // Enqueue on the stream supplied by NVBench (as `simple` and
+      // `single_float64_axis` do) rather than on the default stream.
+      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
+        in_ptr,
+        out_ptr,
+        num_values);
+    });
+}
+// Define a type_list to use for the type axis:
+using cts_types = nvbench::type_list<nvbench::uint8_t,
+                                     nvbench::uint16_t,
+                                     nvbench::uint32_t,
+                                     nvbench::uint64_t,
+                                     nvbench::float32_t,
+                                     nvbench::float64_t>;
+NVBENCH_BENCH_TYPES(copy_type_sweep, NVBENCH_TYPE_AXES(cts_types));
+
+//==============================================================================
+// Type parameter sweep:
+// Convert 64 MiB of InputTypes to OutputTypes, represented with various
+// value_types.
+template <typename InputType, typename OutputType>
+void copy_type_conversion_sweep(nvbench::state &state,
+                                nvbench::type_list<InputType, OutputType>)
+{
+  // Optional: Skip narrowing conversions.
+  // - Still run for lossy same-size int->float.
+  // - This could be done at compile-time with SFINAE to avoid instantiation.
+  if (sizeof(InputType) > sizeof(OutputType))
+  {
+    state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType).");
+    // Return right away: a skipped configuration must not allocate buffers or
+    // launch the kernel below.
+    return;
+  }
+
+  // Number of InputType's in 64 MiB:
+  const std::size_t num_values = 64 * 1024 * 1024 / sizeof(InputType);
+
+  // Report throughput stats:
+  state.add_element_count(num_values);
+  state.add_global_memory_reads<InputType>(num_values);
+  state.add_global_memory_writes<OutputType>(num_values);
+
+  // Allocate device memory:
+  thrust::device_vector<InputType> in(num_values, 0);
+  thrust::device_vector<OutputType> out(num_values, 0);
+
+  state.exec(
+    [num_values,
+     in_ptr  = thrust::raw_pointer_cast(in.data()),
+     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+      // Enqueue on the stream supplied by NVBench (as `simple` and
+      // `single_float64_axis` do) rather than on the default stream.
+      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
+        in_ptr,
+        out_ptr,
+        num_values);
+    });
+}
+// Optional: Skip when InputType == OutputType. This approach avoids
+// instantiating the benchmark at all.
+template <typename T>
+void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list<T, T>)
+{
+  state.skip("Not a conversion: InputType == OutputType.");
+}
+// The same type_list is used for both inputs/outputs.
+using ctcs_types = nvbench::type_list<nvbench::int8_t,
+                                      nvbench::int16_t,
+                                      nvbench::int32_t,
+                                      nvbench::float32_t,
+                                      nvbench::int64_t,
+                                      nvbench::float64_t>;
+NVBENCH_BENCH_TYPES(copy_type_conversion_sweep,
+                    NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
+  .set_type_axes_names({"In", "Out"});
--- a/nvbench/test_kernels.cuh
+++ b/nvbench/test_kernels.cuh
@@ -22,6 +22,17 @@

 #include <cuda_runtime.h>

+/*!
+ * @file test_kernels.cuh
+ * A collection of simple kernels for testing purposes.
+ */
+
+namespace nvbench
+{
+
+/*!
+ * Each launched thread just sleeps for `seconds`.
+ */
 __global__ void sleep_kernel(double seconds)
 {
  const auto start = cuda::std::chrono::high_resolution_clock::now();
@@ -35,3 +46,20 @@ __global__ void sleep_kernel(double seconds)
    now = cuda::std::chrono::high_resolution_clock::now();
  }
 }
+
+/*!
+ * Naive copy of `n` values from `in` -> `out`, converting each value with
+ * `static_cast<U>`.
+ *
+ * Works with any 1D launch configuration: each thread starts at its global
+ * index and advances by the total number of launched threads (grid-stride
+ * loop). Only the x dimension of the grid and block is used.
+ *
+ * @param in  Source array; elements `[0, n)` are read.
+ * @param out Destination array; elements `[0, n)` are written.
+ * @param n   Number of values to copy.
+ */
+template <typename T, typename U>
+__global__ void copy_kernel(const T* in, U* out, std::size_t n)
+{
+  // Do the index math in std::size_t. The built-in index variables are 32-bit
+  // unsigned, so their product can wrap, and a 32-bit loop counter could never
+  // reach `n` once `n` exceeds 2^32 (the loop would not terminate).
+  const auto init =
+    static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const auto step = static_cast<std::size_t>(blockDim.x) * gridDim.x;
+
+  for (auto i = init; i < n; i += step)
+  {
+    out[i] = static_cast<U>(in[i]);
+  }
+}
+
+}
--- a/testing/cuda_timer.cu
+++ b/testing/cuda_timer.cu
@@ -19,9 +19,9 @@
 #include <nvbench/cuda_timer.cuh>

 #include <nvbench/cuda_stream.cuh>
+#include <nvbench/test_kernels.cuh>
 #include <nvbench/types.cuh>

-#include "sleep_kernel.cuh"
 #include "test_asserts.cuh"

 #include <fmt/format.h>
@@ -35,7 +35,7 @@ void test_basic(cudaStream_t time_stream,
  NVBENCH_CUDA_CALL(cudaDeviceSynchronize());

  timer.start(time_stream);
-  sleep_kernel<<<1, 1, 0, exec_stream>>>(0.25);
+  nvbench::sleep_kernel<<<1, 1, 0, exec_stream>>>(0.25);
  timer.stop(time_stream);

  NVBENCH_CUDA_CALL(cudaDeviceSynchronize());