From ca0e795b46f32f97acfdce88e42374ad09089aeb Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 30 Apr 2025 21:40:33 +0200
Subject: [PATCH] Merge pull request #113 from elstehle/fix/per-device-stream

Fixes cudaErrorInvalidValue when running on nvbench-created cuda stream
---
 nvbench/cuda_stream.cuh         | 31 ++++++++++---
 nvbench/detail/measure_cold.cu  |  2 +-
 nvbench/detail/measure_cupti.cu |  2 +-
 nvbench/detail/measure_hot.cu   |  2 +-
 nvbench/state.cuh               | 19 +++++++-
 nvbench/state.cxx               |  2 +
 testing/CMakeLists.txt          |  1 +
 testing/cuda_stream.cu          | 77 +++++++++++++++++++++++++++++++++
 testing/state.cu                |  4 ++
 9 files changed, 130 insertions(+), 10 deletions(-)
 create mode 100644 testing/cuda_stream.cu
diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
index cd2ab1e..99a2de6 100644
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -19,10 +19,13 @@
 #pragma once
 
 #include <nvbench/cuda_call.cuh>
+#include <nvbench/detail/device_scope.cuh>
+#include <nvbench/device_info.cuh>
 
 #include <cuda_runtime_api.h>
 
 #include <memory>
+#include <optional>
 
 namespace nvbench
 {
@@ -39,18 +42,36 @@ namespace nvbench
 struct cuda_stream
 {
   /**
-   * Constructs a cuda_stream that owns a new stream, created with
-   * `cudaStreamCreate`.
+   * Constructs a cuda_stream that owns a new stream, created with `cudaStreamCreate`.
+   *
+   * @param device The device that this stream should be associated with. If no device is provided,
+   * the stream will be associated with the device that is active at the call time.
    */
-  cuda_stream()
-      : m_stream{[]() {
+  explicit cuda_stream(std::optional<nvbench::device_info> device)
+      : m_stream{[device]() {
                    cudaStream_t s;
-                   NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
+                   if (device.has_value())
+                   {
+                     nvbench::detail::device_scope scope_guard{device.value().get_id()};
+                     NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
+                   }
+                   else
+                   {
+                     NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
+                   }
                    return s;
                  }(),
                  stream_deleter{true}}
   {}
 
+  /**
+   * @brief Constructs a new cuda_stream tha is associated with the device that is active at the
+   * call time.
+   */
+  cuda_stream()
+      : cuda_stream(std::nullopt)
+  {}
+
   /**
    * Constructs a `cuda_stream` from an explicit cudaStream_t.
    *
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 6f0f5dd..92807e6 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -37,7 +37,7 @@ namespace nvbench::detail
 
 measure_cold_base::measure_cold_base(state &exec_state)
     : m_state{exec_state}
-    , m_launch{m_state.get_cuda_stream()}
+    , m_launch{exec_state.get_cuda_stream()}
     , m_criterion_params{exec_state.get_criterion_params()}
     , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(
         exec_state.get_stopping_criterion())}
diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu
index e583cd5..24028f2 100644
--- a/nvbench/detail/measure_cupti.cu
+++ b/nvbench/detail/measure_cupti.cu
@@ -165,7 +165,7 @@ measure_cupti_base::measure_cupti_base(state &exec_state)
 // (formatter doesn't handle `try :` very well...)
 try
   : m_state{exec_state}
-  , m_launch{m_state.get_cuda_stream()}
+  , m_launch{exec_state.get_cuda_stream()}
   , m_cupti{*m_state.get_device(), add_metrics(m_state)}
 {}
 // clang-format on
diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu
index 2a38f16..202cda5 100644
--- a/nvbench/detail/measure_hot.cu
+++ b/nvbench/detail/measure_hot.cu
@@ -36,7 +36,7 @@ namespace nvbench::detail
 
 measure_hot_base::measure_hot_base(state &exec_state)
     : m_state{exec_state}
-    , m_launch{m_state.get_cuda_stream()}
+    , m_launch{exec_state.get_cuda_stream()}
     , m_min_samples{exec_state.get_min_samples()}
     , m_min_time{exec_state.get_min_time()}
     , m_skip_time{exec_state.get_skip_time()}
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index 0691dc6..af402c6 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -64,7 +64,21 @@ struct state
   state &operator=(const state &) = delete;
   state &operator=(state &&)      = default;
 
-  [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const { return m_cuda_stream; }
+  /// If a stream exists, return that. Otherwise, create a new stream using the current
+  /// device (or the current device if none is set), save it, and return it.
+  /// @sa get_cuda_stream_optional
+  [[nodiscard]] nvbench::cuda_stream &get_cuda_stream()
+  {
+    if (!m_cuda_stream.has_value())
+    {
+      m_cuda_stream = nvbench::cuda_stream{m_device};
+    }
+    return m_cuda_stream.value();
+  }
+  [[nodiscard]] const std::optional<nvbench::cuda_stream> &get_cuda_stream_optional() const
+  {
+    return m_cuda_stream;
+  }
   void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); }
 
   /// The CUDA device associated with with this benchmark state. May be
@@ -313,7 +327,6 @@ private:
         std::optional<nvbench::device_info> device,
         std::size_t type_config_index);
 
-  nvbench::cuda_stream m_cuda_stream;
   std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
   nvbench::named_values m_axis_values;
   std::optional<nvbench::device_info> m_device;
@@ -334,6 +347,8 @@ private:
   nvbench::float32_t m_throttle_threshold;      // [% of default SM clock rate]
   nvbench::float32_t m_throttle_recovery_delay; // [seconds]
 
+  std::optional<nvbench::cuda_stream> m_cuda_stream;
+
   // Deadlock protection. See blocking_kernel's class doc for details.
   nvbench::float64_t m_blocking_kernel_timeout{30.0};
 
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index df4ec91..d29d40b 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -42,6 +42,7 @@ state::state(const benchmark_base &bench)
     , m_timeout{bench.get_timeout()}
     , m_throttle_threshold{bench.get_throttle_threshold()}
     , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
+    , m_cuda_stream{std::nullopt}
 {}
 
 state::state(const benchmark_base &bench,
@@ -62,6 +63,7 @@ state::state(const benchmark_base &bench,
     , m_timeout{bench.get_timeout()}
     , m_throttle_threshold{bench.get_throttle_threshold()}
     , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
+    , m_cuda_stream{std::nullopt}
 {}
 
 nvbench::int64_t state::get_int64(const std::string &axis_name) const
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index f407258..5ee7824 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -3,6 +3,7 @@ set(test_srcs
   benchmark.cu
   create.cu
   cuda_timer.cu
+  cuda_stream.cu
   cpu_timer.cu
   criterion_manager.cu
   criterion_params.cu
diff --git a/testing/cuda_stream.cu b/testing/cuda_stream.cu
new file mode 100644
index 0000000..05bd902
--- /dev/null
+++ b/testing/cuda_stream.cu
@@ -0,0 +1,77 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/config.cuh>
+#include <nvbench/cuda_stream.cuh>
+#include <nvbench/device_manager.cuh>
+#include <nvbench/types.cuh>
+
+#include <fmt/format.h>
+
+#include "test_asserts.cuh"
+
+namespace
+{
+#ifdef NVBENCH_HAS_CUPTI
+/**
+ * @brief Queries and returns the device id that the given \p cuda_stream is associated with
+ *
+ * @param cuda_stream The stream to get the device id for
+ * @return The device id that \p cuda_stream is associated with
+ */
+int get_device_of_stream(cudaStream_t cuda_stream)
+{
+  CUcontext ctx;
+  NVBENCH_DRIVER_API_CALL(cuStreamGetCtx(CUstream{cuda_stream}, &ctx));
+  NVBENCH_DRIVER_API_CALL(cuCtxPushCurrent(ctx));
+  CUdevice device_id{};
+  NVBENCH_DRIVER_API_CALL(cuCtxGetDevice(&device_id));
+  NVBENCH_DRIVER_API_CALL(cuCtxPopCurrent(&ctx));
+  return static_cast<int>(device_id);
+}
+#endif
+} // namespace
+
+void test_basic()
+{
+#ifdef NVBENCH_HAS_CUPTI
+  // Get devices
+  auto devices = nvbench::device_manager::get().get_devices();
+
+  // Iterate over devices
+  for (auto const &device_info : devices)
+  {
+    // Create stream on the device before it becomes the active device
+    nvbench::cuda_stream device_stream(device_info);
+
+    // Verify cuda stream is associated with the correct cuda device
+    ASSERT(get_device_of_stream(device_stream.get_stream()) == device_info.get_id());
+
+    // Set the device as active device
+    device_info.set_active();
+
+    // Create the stream (implicitly) on the device that is currently active
+    nvbench::cuda_stream current_device_stream{};
+
+    // Verify the cuda stream was in fact associated with the currently active device
+    ASSERT(get_device_of_stream(current_device_stream.get_stream()) == device_info.get_id());
+  }
+#endif
+}
+
+int main() { test_basic(); }
diff --git a/testing/state.cu b/testing/state.cu
index de7848b..5198943 100644
--- a/testing/state.cu
+++ b/testing/state.cu
@@ -55,9 +55,13 @@ void test_streams()
 
   state_tester state{bench};
 
+  // Confirm that the stream hasn't been initialized yet
+  ASSERT(!state.get_cuda_stream_optional().has_value());
+
   // Test non-owning stream
   cudaStream_t default_stream = 0;
   state.set_cuda_stream(nvbench::cuda_stream{default_stream, false});
+  ASSERT(state.get_cuda_stream_optional() == default_stream);
   ASSERT(state.get_cuda_stream() == default_stream);
 
   // Test owning stream