From ca0e795b46f32f97acfdce88e42374ad09089aeb Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 30 Apr 2025 21:40:33 +0200 Subject: [PATCH] Merge pull request #113 from elstehle/fix/per-device-stream Fixes cudaErrorInvalidValue when running on nvbench-created cuda stream --- nvbench/cuda_stream.cuh | 31 ++++++++++--- nvbench/detail/measure_cold.cu | 2 +- nvbench/detail/measure_cupti.cu | 2 +- nvbench/detail/measure_hot.cu | 2 +- nvbench/state.cuh | 19 +++++++- nvbench/state.cxx | 2 + testing/CMakeLists.txt | 1 + testing/cuda_stream.cu | 77 +++++++++++++++++++++++++++++++++ testing/state.cu | 4 ++ 9 files changed, 130 insertions(+), 10 deletions(-) create mode 100644 testing/cuda_stream.cu diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh index cd2ab1e..99a2de6 100644 --- a/nvbench/cuda_stream.cuh +++ b/nvbench/cuda_stream.cuh @@ -19,10 +19,13 @@ #pragma once #include +#include +#include #include #include +#include namespace nvbench { @@ -39,18 +42,36 @@ namespace nvbench struct cuda_stream { /** - * Constructs a cuda_stream that owns a new stream, created with - * `cudaStreamCreate`. + * Constructs a cuda_stream that owns a new stream, created with `cudaStreamCreate`. + * + * @param device The device that this stream should be associated with. If no device is provided, + * the stream will be associated with the device that is active at the call time. */ - cuda_stream() - : m_stream{[]() { + explicit cuda_stream(std::optional device) + : m_stream{[device]() { cudaStream_t s; - NVBENCH_CUDA_CALL(cudaStreamCreate(&s)); + if (device.has_value()) + { + nvbench::detail::device_scope scope_guard{device.value().get_id()}; + NVBENCH_CUDA_CALL(cudaStreamCreate(&s)); + } + else + { + NVBENCH_CUDA_CALL(cudaStreamCreate(&s)); + } return s; }(), stream_deleter{true}} {} + /** + * @brief Constructs a new cuda_stream tha is associated with the device that is active at the + * call time. + */ + cuda_stream() + : cuda_stream(std::nullopt) + {} + /** * Constructs a `cuda_stream` from an explicit cudaStream_t. * diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index 6f0f5dd..92807e6 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -37,7 +37,7 @@ namespace nvbench::detail measure_cold_base::measure_cold_base(state &exec_state) : m_state{exec_state} - , m_launch{m_state.get_cuda_stream()} + , m_launch{exec_state.get_cuda_stream()} , m_criterion_params{exec_state.get_criterion_params()} , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion( exec_state.get_stopping_criterion())} diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu index e583cd5..24028f2 100644 --- a/nvbench/detail/measure_cupti.cu +++ b/nvbench/detail/measure_cupti.cu @@ -165,7 +165,7 @@ measure_cupti_base::measure_cupti_base(state &exec_state) // (formatter doesn't handle `try :` very well...) try : m_state{exec_state} - , m_launch{m_state.get_cuda_stream()} + , m_launch{exec_state.get_cuda_stream()} , m_cupti{*m_state.get_device(), add_metrics(m_state)} {} // clang-format on diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu index 2a38f16..202cda5 100644 --- a/nvbench/detail/measure_hot.cu +++ b/nvbench/detail/measure_hot.cu @@ -36,7 +36,7 @@ namespace nvbench::detail measure_hot_base::measure_hot_base(state &exec_state) : m_state{exec_state} - , m_launch{m_state.get_cuda_stream()} + , m_launch{exec_state.get_cuda_stream()} , m_min_samples{exec_state.get_min_samples()} , m_min_time{exec_state.get_min_time()} , m_skip_time{exec_state.get_skip_time()} diff --git a/nvbench/state.cuh b/nvbench/state.cuh index 0691dc6..af402c6 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -64,7 +64,21 @@ struct state state &operator=(const state &) = delete; state &operator=(state &&) = default; - [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const { return m_cuda_stream; } + /// If a stream exists, return that. Otherwise, create a new stream using the current + /// device (or the current device if none is set), save it, and return it. + /// @sa get_cuda_stream_optional + [[nodiscard]] nvbench::cuda_stream &get_cuda_stream() + { + if (!m_cuda_stream.has_value()) + { + m_cuda_stream = nvbench::cuda_stream{m_device}; + } + return m_cuda_stream.value(); + } + [[nodiscard]] const std::optional &get_cuda_stream_optional() const + { + return m_cuda_stream; + } void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); } /// The CUDA device associated with with this benchmark state. May be @@ -313,7 +327,6 @@ private: std::optional device, std::size_t type_config_index); - nvbench::cuda_stream m_cuda_stream; std::reference_wrapper m_benchmark; nvbench::named_values m_axis_values; std::optional m_device; @@ -334,6 +347,8 @@ private: nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate] nvbench::float32_t m_throttle_recovery_delay; // [seconds] + std::optional m_cuda_stream; + // Deadlock protection. See blocking_kernel's class doc for details. nvbench::float64_t m_blocking_kernel_timeout{30.0}; diff --git a/nvbench/state.cxx b/nvbench/state.cxx index df4ec91..d29d40b 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -42,6 +42,7 @@ state::state(const benchmark_base &bench) , m_timeout{bench.get_timeout()} , m_throttle_threshold{bench.get_throttle_threshold()} , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()} + , m_cuda_stream{std::nullopt} {} state::state(const benchmark_base &bench, @@ -62,6 +63,7 @@ state::state(const benchmark_base &bench, , m_timeout{bench.get_timeout()} , m_throttle_threshold{bench.get_throttle_threshold()} , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()} + , m_cuda_stream{std::nullopt} {} nvbench::int64_t state::get_int64(const std::string &axis_name) const diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index f407258..5ee7824 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -3,6 +3,7 @@ set(test_srcs benchmark.cu create.cu cuda_timer.cu + cuda_stream.cu cpu_timer.cu criterion_manager.cu criterion_params.cu diff --git a/testing/cuda_stream.cu b/testing/cuda_stream.cu new file mode 100644 index 0000000..05bd902 --- /dev/null +++ b/testing/cuda_stream.cu @@ -0,0 +1,77 @@ +/* + * Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include "test_asserts.cuh" + +namespace +{ +#ifdef NVBENCH_HAS_CUPTI +/** + * @brief Queries and returns the device id that the given \p cuda_stream is associated with + * + * @param cuda_stream The stream to get the device id for + * @return The device id that \p cuda_stream is associated with + */ +int get_device_of_stream(cudaStream_t cuda_stream) +{ + CUcontext ctx; + NVBENCH_DRIVER_API_CALL(cuStreamGetCtx(CUstream{cuda_stream}, &ctx)); + NVBENCH_DRIVER_API_CALL(cuCtxPushCurrent(ctx)); + CUdevice device_id{}; + NVBENCH_DRIVER_API_CALL(cuCtxGetDevice(&device_id)); + NVBENCH_DRIVER_API_CALL(cuCtxPopCurrent(&ctx)); + return static_cast(device_id); +} +#endif +} // namespace + +void test_basic() +{ +#ifdef NVBENCH_HAS_CUPTI + // Get devices + auto devices = nvbench::device_manager::get().get_devices(); + + // Iterate over devices + for (auto const &device_info : devices) + { + // Create stream on the device before it becomes the active device + nvbench::cuda_stream device_stream(device_info); + + // Verify cuda stream is associated with the correct cuda device + ASSERT(get_device_of_stream(device_stream.get_stream()) == device_info.get_id()); + + // Set the device as active device + device_info.set_active(); + + // Create the stream (implicitly) on the device that is currently active + nvbench::cuda_stream current_device_stream{}; + + // Verify the cuda stream was in fact associated with the currently active device + ASSERT(get_device_of_stream(current_device_stream.get_stream()) == device_info.get_id()); + } +#endif +} + +int main() { test_basic(); } diff --git a/testing/state.cu b/testing/state.cu index de7848b..5198943 100644 --- a/testing/state.cu +++ b/testing/state.cu @@ -55,9 +55,13 @@ void test_streams() state_tester state{bench}; + // Confirm that the stream hasn't been initialized yet + ASSERT(!state.get_cuda_stream_optional().has_value()); + // Test non-owning stream cudaStream_t default_stream = 0; state.set_cuda_stream(nvbench::cuda_stream{default_stream, false}); + ASSERT(state.get_cuda_stream_optional() == default_stream); ASSERT(state.get_cuda_stream() == default_stream); // Test owning stream