Add more examples.

- exec_tag_timer
- exec_tag_sync
- skip
- throughput
This commit is contained in:
Allison Vacanti
2021-03-09 16:02:34 -05:00
parent 3d83fe20ac
commit 3fc75f5ea6
9 changed files with 369 additions and 10 deletions

View File

@@ -51,13 +51,22 @@ command-line options for configuring benchmark execution at runtime. See the
[CLI overview](docs/cli_help.md)
and [CLI axis specification](docs/cli_help_axis.md) for more information.
## Example Sandbox Project
## Examples
To get started with NVBench, consider trying out
the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo). This
repository contains a simple CMake project that uses NVBench to build an example
benchmark. It's a great way to experiment with the library without a lot of
investment.
This repository provides a number of [examples](examples/) that demonstrate
various NVBench features and use cases:
- [Runtime and compile-time parameter sweeps](examples/axes.cu)
- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
- [Skipping benchmark configurations](examples/skip.cu)
- [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu)
- [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu)
To get started using NVBench with your own kernels, consider trying out
the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo)
. `nvbench_demo` provides a simple CMake project that uses NVBench to build an
example benchmark. It's a great way to experiment with the library without a lot
of investment.
# License

View File

@@ -223,8 +223,12 @@ state.add_global_memory_reads<InputType>(size);
state.add_global_memory_writes<OutputType>(size);
```
For meaningful results, specify the input element count, and include all reads
and writes to global memory.
In general:
- Add only the input element count (no outputs).
- Add all reads and writes to global memory.
More examples can be found in [examples/throughput.cu](../examples/throughput.cu).
# Skip Uninteresting / Invalid Benchmarks
@@ -263,6 +267,8 @@ using Us = nvbench::type_list<...>;
NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(Ts, Us));
```
More examples can be found in [examples/skip.cu](../examples/skip.cu).
# Execution Tags For Special Cases
By default, NVBench assumes that the entire execution time of the
@@ -304,6 +310,9 @@ void sync_example(nvbench::state& state)
NVBENCH_BENCH(sync_example);
```
See [examples/exec_tag_sync.cu](../examples/exec_tag_sync.cu) for a complete
example.
## Explicit timer mode: `nvbench::exec_tag::timer`
For some kernels, the working data may need to be reset between launches. This
@@ -342,6 +351,9 @@ void timer_example(nvbench::state& state)
NVBENCH_BENCH(timer_example);
```
See [examples/exec_tag_timer.cu](../examples/exec_tag_timer.cu) for a complete
example.
# Beware: Combinatorial Explosion Is Lurking
Be very careful of how quickly the configuration space can grow. The following

View File

@@ -1,5 +1,9 @@
# List of example sources; each entry is consumed by the foreach() loop below
# to create one example executable per .cu file. Keep this list sorted.
set(example_srcs
  axes.cu
  exec_tag_sync.cu
  exec_tag_timer.cu
  skip.cu
  throughput.cu
)
foreach(example_src IN LISTS example_srcs)

View File

@@ -24,8 +24,6 @@
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
#include <type_traits>
//==============================================================================
// Simple benchmark with no parameter axes:
void simple(nvbench::state &state)

58
examples/exec_tag_sync.cu Normal file
View File

@@ -0,0 +1,58 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// Used to initialize input data:
#include <thrust/sequence.h>
// `sequence_bench` measures the execution time of `thrust::sequence`. Since
// algorithms in `thrust::` implicitly sync the CUDA device, the
// `nvbench::exec_tag::sync` must be passed to `state.exec(...)`.
//
// By default, NVBench uses some tricks to improve the GPU timing stability.
// This provides more accurate results, but will cause a deadlock if the lambda
// passed to `state.exec(...)` synchronizes. The `nvbench::exec_tag::sync` tag
// tells NVBench to run the benchmark safely.
//
// This tag will also disable the batch measurements, since the synchronization
// will throw off the batch results.
// Benchmark `thrust::sequence` over a 64 MiB int32 buffer. Thrust algorithms
// implicitly sync the device, so the safe-execution tag is required.
void sequence_bench(nvbench::state &state)
{
  // Size the buffer at 64 MiB worth of int32s:
  constexpr std::size_t buffer_bytes = 64 * 1024 * 1024;
  const std::size_t num_values = buffer_bytes / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> values(num_values);

  // Report element and byte throughput for the writes:
  state.add_element_count(num_values);
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  // `nvbench::exec_tag::sync` tells NVBench the lambda syncs the device,
  // avoiding a deadlock with the default timing strategy (this also disables
  // the batch measurements):
  state.exec(nvbench::exec_tag::sync, [&values](nvbench::launch &launch) {
    thrust::sequence(thrust::device.on(launch.get_stream()),
                     values.begin(),
                     values.end());
  });
}
NVBENCH_BENCH(sequence_bench);

View File

@@ -0,0 +1,73 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// Used to initialize input data:
#include <thrust/sequence.h>
// mod2_inplace performs an in-place mod2 over every element in `data`. `data`
// is reset to `input` each iteration. A manual timer is requested by passing
// `nvbench::exec_tag::timer` to `state.exec(...)`, which is used to only time
// the mod2, and not the reset.
//
// Note that this disables the batch timings, since the reset phase will throw
// off the batch results.
// Measures an in-place mod2 over `data`, which is reset from `input` before
// every launch. `nvbench::exec_tag::timer` requests a manual timer so only
// the kernel — not the reset — is timed. (This disables batch timings, since
// the reset phase would throw off the batch results.)
void mod2_inplace(nvbench::state &state)
{
  // Allocate and initialize input data:
  const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> input(num_values);
  thrust::sequence(input.begin(), input.end());

  // Working data buffer:
  thrust::device_vector<nvbench::int32_t> data(num_values);

  // Provide throughput information:
  state.add_element_count(num_values);
  state.add_global_memory_reads<nvbench::int32_t>(num_values);
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  // Request timer with `nvbench::exec_tag::timer`:
  state.exec(nvbench::exec_tag::timer,
             // Lambda now takes a `timer` argument:
             [&input, &data, num_values](nvbench::launch &launch, auto &timer) {
               // Reset working data:
               data = input;
               // Start timer:
               timer.start();
               // Run kernel of interest on the *working* buffer. (Bug fix:
               // this previously passed `input` for both pointers, which
               // clobbered the pristine input and made the reset pointless.)
               nvbench::mod2_kernel<<<256, 256, 0, launch.get_stream()>>>(
                 thrust::raw_pointer_cast(data.data()),
                 thrust::raw_pointer_cast(data.data()),
                 num_values);
               // Stop timer:
               timer.stop();
             });
}
NVBENCH_BENCH(mod2_inplace);

128
examples/skip.cu Normal file
View File

@@ -0,0 +1,128 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// std::enable_if_t
#include <type_traits>
//==============================================================================
// `runtime_skip` demonstrates how to skip benchmarks at runtime.
//
// Two parameter axes are swept (see axes.cu), but some configurations are
// skipped by calling `state.skip` with a skip reason string. This reason
// is printed to the log and captured in JSON output.
// Sweeps two axes (see axes.cu) and skips uninteresting configurations at
// runtime via `state.skip(reason)`; the reason appears in the log and in
// JSON output.
void runtime_skip(nvbench::state &state)
{
  const auto dur = state.get_float64("Duration");
  const auto flavor = state.get_string("Kramble");

  // 'Baz' configs shorter than 0.8 ms are skipped:
  if (flavor == "Baz" && dur < 0.8e-3)
  {
    state.skip("Short 'Baz' benchmarks are skipped.");
    return;
  }

  // 'Foo' configs longer than 0.3 ms are skipped:
  if (flavor == "Foo" && dur > 0.3e-3)
  {
    state.skip("Long 'Foo' benchmarks are skipped.");
    return;
  }

  // Everything else just sleeps for the requested duration:
  state.exec([dur](nvbench::launch &launch) {
    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(dur);
  });
}
NVBENCH_BENCH(runtime_skip)
  // 0, 0.25, 0.5, 0.75, and 1.0 milliseconds
  .add_float64_axis("Duration",
                    nvbench::range(0.,
                                   1.1e-3, // .1e-3 slop for fp precision
                                   0.25e-3))
  .add_string_axis("Kramble", {"Foo", "Bar", "Baz"});
//==============================================================================
// `skip_overload` demonstrates how to skip benchmarks at compile-time via
// overload resolution.
//
// Two type axes are swept, but configurations where InputType == OutputType are
// skipped.
// General case: runs for any InputType/OutputType combination.
template <typename InputType, typename OutputType>
void skip_overload(nvbench::state &state,
                   nvbench::type_list<InputType, OutputType>)
{
  // The overload set is the interesting part of this example, so the body is
  // just a sleep kernel:
  auto do_sleep = [](nvbench::launch &launch) {
    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
  };
  state.exec(do_sleep);
}

// Better match when both types are identical — overload resolution picks this
// one, skipping the configuration at compile time:
template <typename T>
void skip_overload(nvbench::state &state, nvbench::type_list<T, T>)
{
  state.skip("InputType == OutputType.");
}

// Sweep the same type_list on both axes:
using sst_types = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;

// Register the benchmark:
NVBENCH_BENCH_TYPES(skip_overload, NVBENCH_TYPE_AXES(sst_types, sst_types))
  .set_type_axes_names({"In", "Out"});
//==============================================================================
// `skip_sfinae` demonstrates how to skip benchmarks at compile-time using
// SFINAE to handle more complex skip conditions.
//
// Two type axes are swept, but configurations where sizeof(InputType) >
// sizeof(OutputType) are skipped.
// Enable this overload if InputType is not larger than OutputType
template <typename InputType, typename OutputType>
std::enable_if_t<(sizeof(InputType) <= sizeof(OutputType)), void>
skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// This is a contrived example that focuses on the skip overloads, so this is
// just a sleep kernel:
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
}
// Enable this overload if InputType is larger than OutputType
template <typename InputType, typename OutputType>
std::enable_if_t<(sizeof(InputType) > sizeof(OutputType)), void>
skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
state.skip("sizeof(InputType) > sizeof(OutputType).");
}
// The same type_list is used for both inputs/outputs.
using sn_types = nvbench::type_list<nvbench::int8_t,
nvbench::int16_t,
nvbench::int32_t,
nvbench::int64_t>;
// Setup benchmark:
NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types))
.set_type_axes_names({"In", "Out"});

60
examples/throughput.cu Normal file
View File

@@ -0,0 +1,60 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// `throughput_bench` copies a 64 MiB buffer of int32_t, and reports throughput
// in a variety of ways.
//
// Calling `state.add_element_count(num_elements)` with the number of input
// items will report the item throughput rate in elements-per-second.
//
// Calling `state.add_global_memory_reads<T>(num_elements)` and/or
// `state.add_global_memory_writes<T>(num_elements)` will report global device
// memory throughput as a percentage of the current device's peak global memory
// bandwidth, and also in bytes-per-second.
//
// All of these methods take an optional second `column_name` argument, which
// will add a new column to the output with the reported element count / buffer
// size and column name.
// Copies a 64 MiB int32 buffer and reports throughput several ways; the
// optional second argument of the add_* calls names an extra output column.
void throughput_bench(nvbench::state &state)
{
  // 64 MiB of int32s in and out:
  const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> input(num_values);
  thrust::device_vector<nvbench::int32_t> output(num_values);

  // Element throughput (elements/sec), plus global-memory throughput
  // (bytes/sec and % of the device's peak bandwidth):
  state.add_element_count(num_values, "NumElements");
  state.add_global_memory_reads<nvbench::int32_t>(num_values, "DataSize");
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  // Time the copy kernel:
  state.exec([&input, &output, num_values](nvbench::launch &launch) {
    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
      thrust::raw_pointer_cast(input.data()),
      thrust::raw_pointer_cast(output.data()),
      num_values);
  });
}
NVBENCH_BENCH(throughput_bench);

View File

@@ -25,6 +25,8 @@
/*!
* @file test_kernels.cuh
* A collection of simple kernels for testing purposes.
*
* Note that these kernels are written to be short and simple, not performant.
*/
namespace nvbench
@@ -62,4 +64,19 @@ __global__ void copy_kernel(const T* in, U* out, std::size_t n)
}
}
/*!
* For `i <- [0,n)`, `out[i] = in[i] % 2`.
*/
template <typename T, typename U>
__global__ void mod2_kernel(const T* in, U* out, std::size_t n)
{
  // Grid-stride loop: each thread starts at its global index and advances by
  // the total thread count, so any launch configuration covers all n items.
  const auto first  = blockIdx.x * blockDim.x + threadIdx.x;
  const auto stride = blockDim.x * gridDim.x;
  for (auto idx = first; idx < n; idx += stride)
  {
    out[idx] = static_cast<U>(in[idx] % 2);
  }
}
}