Set CUDA_MODULE_LOADING=EAGER before main. (#157)

* Set `CUDA_MODULE_LOADING=EAGER` before `main`. Fixes #136
* Portability for `setenv`.
* Remove pre-main CUDART usage and set up the environment in `main`.
* Fail examples if they deadlock. This is the best way we have to diagnose a regression for NVIDIA/nvbench#136.
* Add an initialize method to benchmark_manager for CUDA-related setup. Benchmarks are created statically, so their constructors cannot call the CUDA APIs without breaking the `CUDA_MODULE_LOADING` setup. This method is called from `main` after the environment has been configured.
2026-05-11 08:50:03 +00:00 · 2024-04-06 11:03:42 -04:00
parent e8c8877d36
commit a2f88ff790
6 changed files with 39 additions and 2 deletions
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -26,6 +26,12 @@ foreach(example_src IN LISTS example_srcs)
    COMMAND "$<TARGET_FILE:${example_name}>" --timeout 0.1 --min-time 1e-5
  )

+  # These should not deadlock. If they do, it may be that the CUDA context was created before
+  # setting CUDA_MODULE_LOADING=EAGER in main, see NVIDIA/nvbench#136.
+  set_tests_properties(${example_name} PROPERTIES
+    FAIL_REGULAR_EXPRESSION "Possible Deadlock Detected"
+  )
+
  add_dependencies(nvbench.example.all ${example_name})
 endforeach()

--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -20,7 +20,6 @@

 #include <nvbench/axes_metadata.cuh>
 #include <nvbench/device_info.cuh>
-#include <nvbench/device_manager.cuh>
 #include <nvbench/state.cuh>
 #include <nvbench/stopping_criterion.cuh>

@@ -53,7 +52,6 @@ struct benchmark_base
  template <typename TypeAxes>
  explicit benchmark_base(TypeAxes type_axes)
      : m_axes(type_axes)
-      , m_devices(nvbench::device_manager::get().get_devices())
  {}

  virtual ~benchmark_base();
--- a/nvbench/benchmark_manager.cuh
+++ b/nvbench/benchmark_manager.cuh
@@ -38,6 +38,15 @@ struct benchmark_manager
   */
  [[nodiscard]] static benchmark_manager &get();

+  /**
+   * Set up any default values for the benchmarks. Invoked from `main`.
+   *
+   * Specifically, any CUDA calls (e.g. cudaGetDeviceProperties) needed to initialize the
+   * benchmarks should be done here to avoid creating a CUDA context before we configure the CUDA
+   * environment in `main`.
+   */
+   void initialize();
+
  /**
   * Register a new benchmark.
   */
--- a/nvbench/benchmark_manager.cxx
+++ b/nvbench/benchmark_manager.cxx
@@ -18,6 +18,7 @@

 #include <nvbench/benchmark_manager.cuh>

+#include <nvbench/device_manager.cuh>
 #include <nvbench/detail/throw.cuh>

 #include <fmt/format.h>
@@ -34,6 +35,15 @@ benchmark_manager &benchmark_manager::get()
  return the_manager;
 }

+void benchmark_manager::initialize()
+{
+  const auto& mgr = device_manager::get(); // deferred until called from main (NVBENCH_MAIN_BODY)
+  for (auto& bench : m_benchmarks)
+  {
+    bench->set_devices(mgr.get_devices()); // each registered benchmark gets the manager's devices
+  }
+}
+
 benchmark_base &benchmark_manager::add(std::unique_ptr<benchmark_base> bench)
 {
  m_benchmarks.push_back(std::move(bench));
--- a/nvbench/main.cuh
+++ b/nvbench/main.cuh
@@ -25,6 +25,7 @@
 #include <nvbench/option_parser.cuh>
 #include <nvbench/printer_base.cuh>

+#include <cstdlib>
 #include <iostream>

 #define NVBENCH_MAIN                                                                               \
@@ -58,10 +59,22 @@
  nvbench::option_parser parser;                                                                   \
  parser.parse(argc, argv)

+// Set CUDA_MODULE_LOADING=EAGER before any CUDA context is created; see NVIDIA/nvbench#136.
+#ifdef _MSC_VER
+#define NVBENCH_INITIALIZE_CUDA_ENV _putenv_s("CUDA_MODULE_LOADING", "EAGER")
+#else
+#define NVBENCH_INITIALIZE_CUDA_ENV setenv("CUDA_MODULE_LOADING", "EAGER", 1)
+#endif
+
+#define NVBENCH_INITIALIZE_BENCHMARKS()                                                            \
+  nvbench::benchmark_manager::get().initialize()
+
 #define NVBENCH_MAIN_BODY(argc, argv)                                                              \
  do                                                                                               \
  {                                                                                                \
+    NVBENCH_INITIALIZE_CUDA_ENV;                                                                   \
    NVBENCH_INITIALIZE_DRIVER_API;                                                                 \
+    NVBENCH_INITIALIZE_BENCHMARKS();                                                               \
    NVBENCH_MAIN_PARSE(argc, argv);                                                                \
    auto &printer = parser.get_printer();                                                          \
                                                                                                   \
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -22,6 +22,7 @@
 #include <nvbench/benchmark_manager.cuh>
 #include <nvbench/csv_printer.cuh>
 #include <nvbench/criterion_manager.cuh>
+#include <nvbench/device_manager.cuh>
 #include <nvbench/git_revision.cuh>
 #include <nvbench/json_printer.cuh>
 #include <nvbench/markdown_printer.cuh>