Formatting updates.

2026-04-20 14:58:54 +00:00 · 2025-04-14 17:26:12 +00:00
parent de36f1a248
commit 3440855dbd
107 changed files with 808 additions and 967 deletions
--- a/examples/auto_throughput.cu
+++ b/examples/auto_throughput.cu
@@ -24,37 +24,33 @@
 template <int ItemsPerThread>
 __global__ void kernel(std::size_t stride,
                       std::size_t elements,
-                       const nvbench::int32_t * __restrict__ in,
+                       const nvbench::int32_t *__restrict__ in,
                       nvbench::int32_t *__restrict__ out)
 {
-  const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const std::size_t tid  = threadIdx.x + blockIdx.x * blockDim.x;
  const std::size_t step = gridDim.x * blockDim.x;

-  for (std::size_t i = stride * tid;
-       i < stride * elements;
-       i += stride * step)
+  for (std::size_t i = stride * tid; i < stride * elements; i += stride * step)
  {
    for (int j = 0; j < ItemsPerThread; j++)
    {
-      const auto read_id = (ItemsPerThread * i + j) % elements;
+      const auto read_id  = (ItemsPerThread * i + j) % elements;
      const auto write_id = tid + j * elements;
-      out[write_id] = in[read_id];
+      out[write_id]       = in[read_id];
    }
  }
 }

-
 // `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
 // and cache hit rates.
 //
 // Calling state.collect_*() enables particular metric collection if nvbench
 // was build with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
 template <int ItemsPerThread>
-void throughput_bench(nvbench::state &state,
-                      nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
+void throughput_bench(nvbench::state &state, nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
 {
  // Allocate input data:
-  const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
+  const std::size_t stride   = static_cast<std::size_t>(state.get_int64("Stride"));
  const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> input(elements);
  thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
@@ -72,12 +68,11 @@ void throughput_bench(nvbench::state &state,
    static_cast<int>((elements + threads_in_block - 1) / threads_in_block);

  state.exec([&](nvbench::launch &launch) {
-    kernel<ItemsPerThread>
-      <<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
-        stride,
-        elements,
-        thrust::raw_pointer_cast(input.data()),
-        thrust::raw_pointer_cast(output.data()));
+    kernel<ItemsPerThread><<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
+      stride,
+      elements,
+      thrust::raw_pointer_cast(input.data()),
+      thrust::raw_pointer_cast(output.data()));
  });
 }

--- a/examples/axes.cu
+++ b/examples/axes.cu
@@ -71,18 +71,16 @@ void copy_sweep_grid_shape(nvbench::state &state)
  thrust::device_vector<nvbench::int32_t> in(num_values, 0);
  thrust::device_vector<nvbench::int32_t> out(num_values, 0);

-  state.exec(
-    [block_size,
-     num_blocks,
-     num_values,
-     in_ptr  = thrust::raw_pointer_cast(in.data()),
-     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
-      (void) num_values; // clang thinks this is unused...
-      nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
-        in_ptr,
-        out_ptr,
-        num_values);
-    });
+  state.exec([block_size,
+              num_blocks,
+              num_values,
+              in_ptr  = thrust::raw_pointer_cast(in.data()),
+              out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+    (void)num_values; // clang thinks this is unused...
+    nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(in_ptr,
+                                                                             out_ptr,
+                                                                             num_values);
+  });
 }
 NVBENCH_BENCH(copy_sweep_grid_shape)
  // Every second power of two from  64->1024:
@@ -107,15 +105,12 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
  thrust::device_vector<ValueType> in(num_values, 0);
  thrust::device_vector<ValueType> out(num_values, 0);

-  state.exec(
-    [num_values,
-     in_ptr  = thrust::raw_pointer_cast(in.data()),
-     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
-      (void) num_values; // clang thinks this is unused...
-      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
-                                                                 out_ptr,
-                                                                 num_values);
-    });
+  state.exec([num_values,
+              in_ptr  = thrust::raw_pointer_cast(in.data()),
+              out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+    (void)num_values; // clang thinks this is unused...
+    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
+  });
 }
 // Define a type_list to use for the type axis:
 using cts_types = nvbench::type_list<nvbench::uint8_t,
@@ -131,11 +126,10 @@ NVBENCH_BENCH_TYPES(copy_type_sweep, NVBENCH_TYPE_AXES(cts_types));
 // Convert 64 MiB of InputTypes to OutputTypes, represented with various
 // value_types.
 template <typename InputType, typename OutputType>
-void copy_type_conversion_sweep(nvbench::state &state,
-                                nvbench::type_list<InputType, OutputType>)
+void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
 {
  // Optional: Skip narrowing conversions.
-  if constexpr(sizeof(InputType) > sizeof(OutputType))
+  if constexpr (sizeof(InputType) > sizeof(OutputType))
  {
    state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType).");
    return;
@@ -154,15 +148,12 @@ void copy_type_conversion_sweep(nvbench::state &state,
  thrust::device_vector<InputType> in(num_values, 0);
  thrust::device_vector<OutputType> out(num_values, 0);

-  state.exec(
-    [num_values,
-     in_ptr  = thrust::raw_pointer_cast(in.data()),
-     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
-      (void) num_values; // clang thinks this is unused...
-      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
-                                                                 out_ptr,
-                                                                 num_values);
-    });
+  state.exec([num_values,
+              in_ptr  = thrust::raw_pointer_cast(in.data()),
+              out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+    (void)num_values; // clang thinks this is unused...
+    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
+  });
 }
 // Optional: Skip when InputType == OutputType. This approach avoids
 // instantiating the benchmark at all.
@@ -178,6 +169,5 @@ using ctcs_types = nvbench::type_list<nvbench::int8_t,
                                      nvbench::float32_t,
                                      nvbench::int64_t,
                                      nvbench::float64_t>;
-NVBENCH_BENCH_TYPES(copy_type_conversion_sweep,
-                    NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
+NVBENCH_BENCH_TYPES(copy_type_conversion_sweep, NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
  .set_type_axes_names({"In", "Out"});
--- a/examples/custom_criterion.cu
+++ b/examples/custom_criterion.cu
@@ -36,10 +36,7 @@ public:

 protected:
  // Setup the criterion in the `do_initialize()` method:
-  virtual void do_initialize() override
-  {
-    m_num_samples = 0;
-  }
+  virtual void do_initialize() override { m_num_samples = 0; }

  // Process new measurements in the `add_measurement()` method:
  virtual void do_add_measurement(nvbench::float64_t /* measurement */) override
@@ -52,7 +49,6 @@ protected:
  {
    return m_num_samples >= m_params.get_int64("max-samples");
  }
-
 };

 // Register the criterion with NVBench:
@@ -71,7 +67,7 @@ void throughput_bench(nvbench::state &state)
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  state.exec(nvbench::exec_tag::no_batch, [&input, &output, num_values](nvbench::launch &launch) {
-    (void) num_values; // clang thinks this is unused...
+    (void)num_values; // clang thinks this is unused...
    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
      thrust::raw_pointer_cast(input.data()),
      thrust::raw_pointer_cast(output.data()),
--- a/examples/enums.cu
+++ b/examples/enums.cu
@@ -17,7 +17,6 @@
 */

 #include <nvbench/nvbench.cuh>
-
 #include <nvbench/test_kernels.cuh>

 // Enum to use as parameter axis:
@@ -68,12 +67,10 @@ void runtime_enum_sweep_string(nvbench::state &state)
  // Create inputs, etc, configure runtime kernel parameters, etc.

  // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
-NVBENCH_BENCH(runtime_enum_sweep_string)
-  .add_string_axis("MyEnum", {"A", "B", "C"});
+NVBENCH_BENCH(runtime_enum_sweep_string).add_string_axis("MyEnum", {"A", "B", "C"});

 //==============================================================================
 // Sweep through enum values at runtime using an int64 axis.
@@ -97,9 +94,8 @@ void runtime_enum_sweep_int64(nvbench::state &state)
  // Create inputs, etc, configure runtime kernel parameters, etc.

  // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
 NVBENCH_BENCH(runtime_enum_sweep_int64)
  .add_int64_axis("MyEnum",
@@ -178,12 +174,10 @@ void compile_time_enum_sweep(nvbench::state &state,
  // Template parameters, static dispatch, etc.

  // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
-using MyEnumList =
-  nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
+using MyEnumList = nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
 NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
  .set_type_axes_names({"MyEnum"});

@@ -199,16 +193,14 @@ NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
 //  * `-12` (struct std::integral_constant<int,-12>)
 // ```
 template <nvbench::int32_t IntValue>
-void compile_time_int_sweep(nvbench::state &state,
-                            nvbench::type_list<nvbench::enum_type<IntValue>>)
+void compile_time_int_sweep(nvbench::state &state, nvbench::type_list<nvbench::enum_type<IntValue>>)
 {
  // Use IntValue in compile time contexts.
  // Template parameters, static dispatch, etc.

  // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
 using MyInts = nvbench::enum_type_list<0, 16, 4096, -12>;
 NVBENCH_BENCH_TYPES(compile_time_int_sweep, NVBENCH_TYPE_AXES(MyInts))
--- a/examples/exec_tag_sync.cu
+++ b/examples/exec_tag_sync.cu
@@ -53,9 +53,7 @@ void sequence_bench(nvbench::state &state)

  // nvbench::exec_tag::sync indicates that this will implicitly sync:
  state.exec(nvbench::exec_tag::sync, [&data](nvbench::launch &launch) {
-    thrust::sequence(thrust::device.on(launch.get_stream()),
-                     data.begin(),
-                     data.end());
+    thrust::sequence(thrust::device.on(launch.get_stream()), data.begin(), data.end());
  });
 }
 NVBENCH_BENCH(sequence_bench);
--- a/examples/exec_tag_timer.cu
+++ b/examples/exec_tag_timer.cu
@@ -23,8 +23,8 @@

 // Thrust simplifies memory management, etc:
 #include <thrust/copy.h>
-#include <thrust/execution_policy.h>
 #include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
 #include <thrust/sequence.h>

 // mod2_inplace performs an in-place mod2 over every element in `data`. `data`
@@ -54,7 +54,7 @@ void mod2_inplace(nvbench::state &state)
  state.exec(nvbench::exec_tag::timer,
             // Lambda now takes a `timer` argument:
             [&input, &data, num_values](nvbench::launch &launch, auto &timer) {
-               (void) num_values; // clang thinks this is unused...
+               (void)num_values; // clang thinks this is unused...

               // Reset working data:
               thrust::copy(thrust::device.on(launch.get_stream()),
--- a/examples/skip.cu
+++ b/examples/skip.cu
@@ -72,14 +72,12 @@ NVBENCH_BENCH(runtime_skip)
 // Two type axes are swept, but configurations where InputType == OutputType are
 // skipped.
 template <typename InputType, typename OutputType>
-void skip_overload(nvbench::state &state,
-                   nvbench::type_list<InputType, OutputType>)
+void skip_overload(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
 {
  // This is a contrived example that focuses on the skip overloads, so this is
  // just a sleep kernel:
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
 // Overload of skip_overload that is called when InputType == OutputType.
 template <typename T>
@@ -107,9 +105,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
 {
  // This is a contrived example that focuses on the skip overloads, so this is
  // just a sleep kernel:
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
 // Enable this overload if InputType is larger than OutputType
 template <typename InputType, typename OutputType>
@@ -119,10 +116,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
  state.skip("sizeof(InputType) > sizeof(OutputType).");
 }
 // The same type_list is used for both inputs/outputs.
-using sn_types = nvbench::type_list<nvbench::int8_t,
-                                    nvbench::int16_t,
-                                    nvbench::int32_t,
-                                    nvbench::int64_t>;
+using sn_types =
+  nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
 // Setup benchmark:
 NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types))
  .set_type_axes_names({"In", "Out"});
--- a/examples/stream.cu
+++ b/examples/stream.cu
@@ -52,7 +52,7 @@ void stream_bench(nvbench::state &state)
  state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));

  state.exec([&input, &output, num_values](nvbench::launch &) {
-    (void) num_values; // clang thinks this is unused...
+    (void)num_values; // clang thinks this is unused...
    copy(thrust::raw_pointer_cast(input.data()),
         thrust::raw_pointer_cast(output.data()),
         num_values);
--- a/examples/summaries.cu
+++ b/examples/summaries.cu
@@ -26,8 +26,8 @@
 void summary_example(nvbench::state &state)
 {
  // Fetch parameters and compute duration in seconds:
-  const auto ms = static_cast<nvbench::float64_t>(state.get_int64("ms"));
-  const auto us = static_cast<nvbench::float64_t>(state.get_int64("us"));
+  const auto ms       = static_cast<nvbench::float64_t>(state.get_int64("ms"));
+  const auto us       = static_cast<nvbench::float64_t>(state.get_int64("us"));
  const auto duration = ms * 1e-3 + us * 1e-6;

  // Add a new column to the summary table with the derived duration used by the benchmark.
--- a/examples/throughput.cu
+++ b/examples/throughput.cu
@@ -51,7 +51,7 @@ void throughput_bench(nvbench::state &state)
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  state.exec([&input, &output, num_values](nvbench::launch &launch) {
-    (void) num_values; // clang thinks this is unused...
+    (void)num_values; // clang thinks this is unused...
    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
      thrust::raw_pointer_cast(input.data()),
      thrust::raw_pointer_cast(output.data()),