Add NVML support for persistence mode, locking clocks.

Locking clocks is currently only implemented for Volta+ devices.

Example usage:

my_bench -d [0,1,3] --persistence-mode 1 --lock-gpu-clocks base

See the cli_help.md docs for more info.
This commit is contained in:
Allison Vacanti
2021-10-21 01:01:16 -04:00
parent d0c90ff920
commit b948e79cab
18 changed files with 656 additions and 19 deletions

View File

@@ -1,4 +1,5 @@
# 3.20.1 required for rapids-cmake
# 3.21.0 required for NVBench_ADD_DEPENDENT_DLLS_TO_* (MSVC only)
cmake_minimum_required(VERSION 3.20.1)
set(CMAKE_CXX_STANDARD 17)
@@ -15,17 +16,21 @@ project(NVBench
nvbench_init_rapids_cmake()
option(NVBench_ENABLE_NVML "Build with NVML support from the Cuda Toolkit." ON)
option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)
include(cmake/NVBenchConfigTarget.cmake)
include(cmake/NVBenchDependencies.cmake)
include(cmake/NVBenchDependentDlls.cmake)
include(cmake/NVBenchExports.cmake)
include(cmake/NVBenchWriteConfigHeader.cmake)
include(cmake/NVBenchDependencies.cmake)
include(cmake/NVBenchInstallRules.cmake)
include(cmake/NVBenchUtilities.cmake)
message(STATUS "NVBench CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)
add_subdirectory(nvbench)
if (NVBench_ENABLE_EXAMPLES OR NVBench_ENABLE_TESTING)

View File

@@ -10,7 +10,7 @@ features:
be dynamic numbers/strings or [static types](docs/benchmarks.md#type-axes).
* [Runtime customization](docs/cli_help.md): A rich command-line interface
allows [redefinition of parameter axes](docs/cli_help_axis.md), CUDA device
selection, changing output formats, and more.
selection, locking GPU clocks (Volta+), changing output formats, and more.
* [Throughput calculations](docs/benchmarks.md#throughput-measurements): Compute
and report:
* Item throughput (elements/second)

View File

@@ -57,3 +57,10 @@ rapids_find_package(CUDAToolkit REQUIRED
# Append CTK targets to this as we add optional deps (NVML, CUPTI, ...)
set(ctk_libraries CUDA::toolkit)
################################################################################
# CUDAToolkit -> NVML
if (NVBench_ENABLE_NVML)
include("${CMAKE_CURRENT_LIST_DIR}/NVBenchNVML.cmake")
list(APPEND ctk_libraries nvbench::nvml)
endif()

View File

@@ -0,0 +1,36 @@
# By default, add dependent DLLs to the build dir on MSVC. This avoids
# a variety of runtime issues when using NVML, etc.
# This behavior can be disabled using the following options:
if (WIN32)
option(NVBench_ADD_DEPENDENT_DLLS_TO_BUILD
"Copy dependent dlls to NVBench library build location (MSVC only)."
ON
)
else()
# These are forced off for non-MSVC builds, as $<TARGET_RUNTIME_DLLS:...>
# will always be empty on non-dll platforms.
set(NVBench_ADD_DEPENDENT_DLLS_TO_BUILD OFF)
endif()
if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD)
message(STATUS
"CMake 3.21.0 is required when NVBench_ADD_DEPENDENT_DLLS_TO_BUILD "
"is enabled."
)
cmake_minimum_required(VERSION 3.21.0)
endif()
function(nvbench_setup_dep_dlls target_name)
# The custom command below fails when there aren't any runtime DLLs to copy,
# so only enable it when a relevant dependency is enabled:
if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD AND NVBench_ENABLE_NVML)
add_custom_command(TARGET ${target_name}
POST_BUILD
COMMAND
"${CMAKE_COMMAND}" -E copy
"$<TARGET_RUNTIME_DLLS:${target_name}>"
"$<TARGET_FILE_DIR:${target_name}>"
COMMAND_EXPAND_LISTS
)
endif()
endfunction()

View File

@@ -1,14 +1,28 @@
macro(nvbench_generate_exports)
set(nvbench_build_export_code_block "")
set(nvbench_install_export_code_block "")
if (NVBench_ENABLE_NVML)
string(APPEND nvbench_build_export_code_block
"include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake\")\n"
)
string(APPEND nvbench_install_export_code_block
"include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchNVML.cmake\")\n"
)
endif()
rapids_export(BUILD NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"
GLOBAL_TARGETS nvbench main
LANGUAGES CUDA CXX
FINAL_CODE_BLOCK nvbench_build_export_code_block
)
rapids_export(INSTALL NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"
GLOBAL_TARGETS nvbench main
LANGUAGES CUDA CXX
FINAL_CODE_BLOCK nvbench_install_export_code_block
)
endmacro()

View File

@@ -10,13 +10,35 @@ install(DIRECTORY "${NVBench_SOURCE_DIR}/nvbench"
)
# generated headers from build dir:
install(FILES
"${NVBench_BINARY_DIR}/nvbench/detail/version.cuh"
"${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
install(
FILES
"${NVBench_BINARY_DIR}/nvbench/config.cuh"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench"
)
install(
FILES
"${NVBench_BINARY_DIR}/nvbench/detail/version.cuh"
"${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench/detail"
)
#
# Install CMake files needed by consumers to locate dependencies:
#
# Borrowing this logic from rapids_cmake's export logic to make sure these end
# up in the same location as nvbench-config.cmake:
rapids_cmake_install_lib_dir(config_install_location)
set(config_install_location "${config_install_location}/cmake/nvbench")
if (NVBench_ENABLE_NVML)
install(
FILES
"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake"
DESTINATION "${config_install_location}"
)
endif()
# Call with a list of library targets to generate install rules:
function(nvbench_install_libraries)
install(TARGETS ${ARGN}

37
cmake/NVBenchNVML.cmake Normal file
View File

@@ -0,0 +1,37 @@
# Defines the imported/alias target `nvbench::nvml` used to link NVML.
#
# This file is installed alongside the package config and is also included
# from the generated export code blocks, so it may be pulled in more than
# once per configure. Guard against redefining the target:
if (TARGET nvbench::nvml)
  return()
endif()

# Since this file is installed, we need to make sure that the CUDAToolkit has
# been found by consumers:
if (NOT TARGET CUDA::toolkit)
  find_package(CUDAToolkit REQUIRED)
endif()

if (WIN32)
  # The CUDA:: targets currently don't provide dll locations through the
  # `IMPORTED_LOCATION` property, nor are they marked as `SHARED` libraries
  # (they're currently `UNKNOWN`). This prevents the `nvbench_setup_dep_dlls`
  # CMake function from copying the dlls to the build / install directories.
  # This is discussed in https://gitlab.kitware.com/cmake/cmake/-/issues/22845
  # and the other CMake issues it links to.
  #
  # We create a nvbench-specific target that configures the nvml interface as
  # described here:
  # https://gitlab.kitware.com/cmake/cmake/-/issues/22845#note_1077538
  #
  # Use find_file instead of find_library, which would search for a .lib file.
  # This is also nice because find_file searches recursively (find_library
  # does not) and some versions of CTK nest nvml.dll several directories deep
  # under C:\Windows\System32.
  find_file(NVBench_NVML_DLL nvml.dll REQUIRED
    DOC "The full path to nvml.dll. Usually somewhere under C:/Windows/System32."
    PATHS "C:/Windows/System32"
  )
  mark_as_advanced(NVBench_NVML_DLL)

  add_library(nvbench::nvml SHARED IMPORTED)
  target_link_libraries(nvbench::nvml INTERFACE CUDA::toolkit)
  set_target_properties(nvbench::nvml PROPERTIES
    IMPORTED_LOCATION "${NVBench_NVML_DLL}"
    IMPORTED_IMPLIB "${CUDA_nvml_LIBRARY}"
  )
else()
  # Linux is much easier...
  add_library(nvbench::nvml ALIAS CUDA::nvml)
endif()

View File

@@ -0,0 +1,7 @@
# nvbench_write_config_header(<filepath>)
#
# Generates NVBench's configuration header at <filepath> from
# cmake/config.cuh.in, defining NVBENCH_HAS_NVML when NVML support is on.
function(nvbench_write_config_header filepath)
  if (NVBench_ENABLE_NVML)
    set(NVBENCH_HAS_NVML 1)
  endif()
  # @ONLY restricts substitution to @VAR@ references, so any literal `${...}`
  # text in the template survives; the template only uses #cmakedefine.
  configure_file("${NVBench_SOURCE_DIR}/cmake/config.cuh.in" "${filepath}" @ONLY)
endfunction()

22
cmake/config.cuh.in Normal file
View File

@@ -0,0 +1,22 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
// Defined if NVBench has been built with NVML support.
#cmakedefine NVBENCH_HAS_NVML

View File

@@ -12,6 +12,33 @@
* `--version`
* Print information about the version of NVBench used to build the executable.
# Device Modification
* `--persistence-mode <state>`, `--pm <state>`
* Sets persistence mode for one or more GPU devices.
* Applies to the devices described by the most recent `--devices` option,
or all devices if `--devices` is not specified.
* This option requires root / admin permissions.
* This option is only supported on Linux.
* This call must precede all other device modification options, if any.
* Note that persistence mode is deprecated and will be removed at some point
in favor of the new persistence daemon. See the following link for more
details: https://docs.nvidia.com/deploy/driver-persistence/index.html
* Valid values for `state` are:
* `0`: Disable persistence mode.
* `1`: Enable persistence mode.
* `--lock-gpu-clocks <rate>`, `--lgc <rate>`
* Lock GPU clocks for one or more devices to a particular rate.
* Applies to the devices described by the most recent `--devices` option,
or all devices if `--devices` is not specified.
* This option requires root / admin permissions.
* This option is only supported on Volta+ (sm_70+) devices.
* Valid values for `rate` are:
* `reset`, `unlock`, `none`: Unlock the GPU clocks.
* `base`, `tdp`: Lock clocks to base frequency (best for stable results).
* `max`, `maximum`: Lock clocks to max frequency (best for fastest results).
# Output
* `--csv <filename/stream>`
@@ -51,7 +78,7 @@
* `--devices <device ids>`, `--device <device ids>`, `-d <device ids>`
* Limit execution to one or more devices.
* `<device ids>` is a single id, or a comma separated list.
* `<device ids>` is a single id, a comma separated list, or the string "all".
* Device ids can be obtained from `--list`.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.

View File

@@ -26,6 +26,10 @@ set(srcs
detail/state_generator.cxx
)
if (NVBench_ENABLE_NVML)
list(APPEND srcs internal/nvml.cxx)
endif()
# CUDA 11.0 can't compile json_printer without crashing
# So for that version fall back to C++ with degraded
# output ( no PTX version info )
@@ -49,6 +53,8 @@ file_to_string("../docs/cli_help_axis.md"
cli_help_axis_text
)
nvbench_write_config_header("${NVBench_BINARY_DIR}/nvbench/config.cuh")
# nvbench (nvbench::nvbench)
add_library(nvbench SHARED ${srcs})
target_include_directories(nvbench PUBLIC
@@ -77,4 +83,5 @@ add_dependencies(nvbench.all nvbench_main)
add_library(nvbench::nvbench ALIAS nvbench)
add_library(nvbench::main ALIAS nvbench_main)
nvbench_setup_dep_dlls(nvbench)
nvbench_install_libraries(nvbench nvbench_main)

View File

@@ -18,8 +18,10 @@
#include <nvbench/device_info.cuh>
#include <nvbench/config.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/detail/device_scope.cuh>
#include <nvbench/internal/nvml.cuh>
#include <cuda_runtime_api.h>
@@ -38,8 +40,108 @@ device_info::memory_info device_info::get_global_memory_usage() const
device_info::device_info(int id)
: m_id{id}
, m_prop{}
, m_nvml_device(nullptr)
{
NVBENCH_CUDA_CALL(cudaGetDeviceProperties(&m_prop, m_id));
#ifdef NVBENCH_HAS_NVML
// Retrieve the current device's pci_id as a null-terminated string.
// Docs say 13 chars should always be sufficient.
constexpr int pci_id_len = 13;
char pci_id[pci_id_len];
NVBENCH_CUDA_CALL(cudaDeviceGetPCIBusId(pci_id, pci_id_len, m_id));
NVBENCH_NVML_CALL(nvmlDeviceGetHandleByPciBusId(pci_id, &m_nvml_device));
#endif // NVBENCH_HAS_NVML
}
void device_info::set_persistence_mode(bool state)
#ifndef NVBENCH_HAS_NVML
{
throw nvbench::nvml::not_enabled{};
}
#else // NVBENCH_HAS_NVML
try
{
NVBENCH_NVML_CALL(nvmlDeviceSetPersistenceMode(
m_nvml_device,
state ? NVML_FEATURE_ENABLED : NVML_FEATURE_DISABLED));
}
catch (nvml::call_failed &e)
{
if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED)
{
NVBENCH_THROW(std::runtime_error,
"{}",
"Persistence mode is only supported on Linux.");
}
else if (e.get_error_code() == NVML_ERROR_NO_PERMISSION)
{
NVBENCH_THROW(std::runtime_error,
"{}",
"Root/Admin permissions required to set persistence mode.");
}
throw;
}
#endif // NVBENCH_HAS_NVML
void device_info::lock_gpu_clocks(device_info::clock_rate rate)
#ifndef NVBENCH_HAS_NVML
{
throw nvbench::nvml::not_enabled{};
}
#else // NVBENCH_HAS_NVML
try
{
switch (rate)
{
case clock_rate::none:
NVBENCH_NVML_CALL(nvmlDeviceResetGpuLockedClocks(m_nvml_device));
break;
case clock_rate::base:
NVBENCH_NVML_CALL(nvmlDeviceSetGpuLockedClocks(
m_nvml_device,
static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP),
static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP)));
break;
case clock_rate::maximum: {
const auto max_mhz = static_cast<unsigned int>(
this->get_sm_default_clock_rate() / (1000 * 1000));
NVBENCH_NVML_CALL(
nvmlDeviceSetGpuLockedClocks(m_nvml_device, max_mhz, max_mhz));
break;
}
default:
NVBENCH_THROW(std::runtime_error,
"Unrecognized clock rate: {}",
static_cast<int>(rate));
}
}
catch (nvml::call_failed &e)
{
if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED &&
this->get_ptx_version() < 700)
{
NVBENCH_THROW(std::runtime_error,
"GPU clock rates can only be modified for Volta and later. "
"Device: {} ({}) SM: {} < {}",
this->get_name(),
this->get_id(),
this->get_ptx_version(),
700);
}
else if (e.get_error_code() == NVML_ERROR_NO_PERMISSION)
{
NVBENCH_THROW(std::runtime_error,
"{}",
"Root/Admin permissions required to change GPU clock rates.");
}
throw;
}
#endif // NVBENCH_HAS_NVML
} // namespace nvbench

View File

@@ -27,6 +27,9 @@
#include <string_view>
#include <utility>
// forward declare this for internal storage
struct nvmlDevice_st;
namespace nvbench
{
@@ -66,13 +69,35 @@ struct device_info
NVBENCH_CUDA_CALL(cudaSetDevice(m_id));
}
/// Enable or disable persistence mode.
/// @note Only supported on Linux.
/// @note Requires root / admin privileges.
void set_persistence_mode(bool state);
/// Symbolic values for special clock rates
enum class clock_rate
{
/// Unlock clocks
none,
/// Base TDP clock; Preferred for stable benchmarking
base,
/// Maximum clock rate
maximum
};
/// Lock GPU clocks to the specified rate.
/// @note Only supported on Volta+ (sm_70+) devices.
/// @note Requires root / admin privileges.
void lock_gpu_clocks(clock_rate rate);
/// @return The SM version of the current device as (major*100) + (minor*10).
[[nodiscard]] int get_sm_version() const
{
return m_prop.major * 100 + m_prop.minor * 10;
}
/// @return The PTX version of the current device
/// @return The PTX version of the current device, e.g. sm_80 returns 800.
[[nodiscard]] __forceinline__ int get_ptx_version() const
{
return detail::get_ptx_version(m_id);
@@ -197,6 +222,7 @@ struct device_info
private:
int m_id;
cudaDeviceProp m_prop;
nvmlDevice_st *m_nvml_device;
};
// get_ptx_version implementation; this needs to stay in the header so it will

119
nvbench/internal/nvml.cuh Normal file
View File

@@ -0,0 +1,119 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/config.cuh>
#include <nvbench/detail/throw.cuh>
#include <fmt/format.h>
#ifdef NVBENCH_HAS_NVML
#include <nvml.h>
#endif // NVBENCH_HAS_NVML
#include <stdexcept>
namespace nvbench::nvml
{
/// Base class for NVML-specific exceptions
///
/// Inherits std::runtime_error's constructors, so it is constructed directly
/// from a message string. Catch this type to handle any NVML-related failure.
struct error : std::runtime_error
{
using runtime_error::runtime_error;
};
/// Thrown when NVML support is disabled.
struct not_enabled : error
{
not_enabled()
: error{"NVML not available. Reconfigure NVBench with the CMake option "
"`-DNVBench_ENABLE_NVML=ON`."}
{}
};
// Only `error` and `not_enabled` are defined when NVML is disabled.
// Other exceptions may hold types defined by NVML.
#ifdef NVBENCH_HAS_NVML
/// Thrown when a generic NVML call inside NVBENCH_NVML_CALL fails
struct call_failed : error
{
call_failed(const std::string &filename,
std::size_t lineno,
const std::string &call,
nvmlReturn_t error_code,
std::string error_string)
: error(fmt::format("{}:{}:\n"
"\tNVML call failed:\n"
"\t\tCall: {}\n"
"\t\tError: ({}) {}",
filename,
lineno,
call,
static_cast<int>(error_code),
error_string))
, m_error_code(error_code)
, m_error_string(error_string)
{}
[[nodiscard]] nvmlReturn_t get_error_code() const { return m_error_code; }
[[nodiscard]] const std::string &get_error_string() const
{
return m_error_string;
}
private:
nvmlReturn_t m_error_code;
std::string m_error_string;
};
#endif // NVBENCH_HAS_NVML
} // namespace nvbench::nvml
#ifdef NVBENCH_HAS_NVML
#define NVBENCH_NVML_CALL(call) \
do \
{ \
const auto _rr = call; \
if (_rr != NVML_SUCCESS) \
{ \
throw nvbench::nvml::call_failed(__FILE__, \
__LINE__, \
#call, \
_rr, \
nvmlErrorString(_rr)); \
} \
} while (false)
// Same as above, but used for nvmlInit(), where a failure means that
// nvmlErrorString is not available.
#define NVBENCH_NVML_CALL_NO_API(call) \
do \
{ \
const auto _rr = call; \
if (_rr != NVML_SUCCESS) \
{ \
throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, ""); \
} \
} while (false)
#endif // NVBENCH_HAS_NVML

71
nvbench/internal/nvml.cxx Normal file
View File

@@ -0,0 +1,71 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/internal/nvml.cuh>
#include <nvbench/config.cuh>
#include <fmt/format.h>
#include <nvml.h>
#include <stdexcept>
namespace
{
// RAII struct that initializes and shuts down NVML
//
// The constructor calls nvmlInit() and records success in m_inited; the
// destructor calls nvmlShutdown() only when init succeeded. Failures in
// either direction are reported to stdout rather than thrown, so a missing
// or broken NVML installation never aborts the process (and the destructor
// never throws during static teardown).
struct NVMLLifetimeManager
{
NVMLLifetimeManager()
{
try
{
// NO_API variant: if init fails, nvmlErrorString is not usable yet.
NVBENCH_NVML_CALL_NO_API(nvmlInit());
m_inited = true;
}
catch (std::exception &e)
{
// Deliberately swallow: NVML features will later throw when used.
fmt::print("NVML initialization failed:\n {}", e.what());
}
}

~NVMLLifetimeManager()
{
// Only shut down if init actually succeeded.
if (m_inited)
{
try
{
NVBENCH_NVML_CALL_NO_API(nvmlShutdown());
}
catch (std::exception &e)
{
fmt::print("NVML shutdown failed:\n {}", e.what());
}
}
}

private:
// True once nvmlInit() has succeeded; guards the shutdown call.
bool m_inited{false};
};
// NVML's lifetime should extend for the entirety of the process, so store in a
// global.
auto nvml_lifetime = NVMLLifetimeManager{};
} // namespace

View File

@@ -22,6 +22,7 @@
#include <nvbench/benchmark_base.cuh>
#include <nvbench/benchmark_manager.cuh>
#include <nvbench/callable.cuh>
#include <nvbench/config.cuh>
#include <nvbench/cpu_timer.cuh>
#include <nvbench/create.cuh>
#include <nvbench/cuda_call.cuh>

View File

@@ -252,6 +252,24 @@ std::vector<T> parse_values(std::string_view value_spec)
}
}
std::vector<nvbench::device_info> parse_devices(std::string_view devices)
{
auto &dev_mgr = nvbench::device_manager::get();
if (devices == "all")
{
return dev_mgr.get_devices();
}
std::vector<nvbench::device_info> result;
auto dev_ids = parse_values<nvbench::int32_t>(devices);
for (nvbench::int32_t dev_id : dev_ids)
{
result.push_back(dev_mgr.get_device(dev_id));
}
return result;
}
// Parse an axis specification into a 3-tuple of string_views containing the
// axis name, flags, and values.
auto parse_axis_key_flag_value_spec(const std::string &spec)
@@ -322,6 +340,9 @@ void option_parser::parse_impl()
{
m_global_benchmark_args.clear();
// Initialize to all devices:
m_recent_devices = nvbench::device_manager::get().get_devices();
// Initialize color variable based on env var:
{
const char *var = std::getenv("NVBENCH_COLOR");
@@ -330,6 +351,11 @@ void option_parser::parse_impl()
this->parse_range(m_args.cbegin(), m_args.cend());
if (m_exit_after_parsing)
{
std::exit(0);
}
if (m_benchmarks.empty())
{
// If no benchmarks were specified, add all:
@@ -397,6 +423,18 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
this->print_list();
std::exit(0);
}
else if (arg == "--persistence-mode" || arg == "--pm")
{
check_params(1);
this->set_persistence_mode(first[1]);
first += 2;
}
else if (arg == "--lock-gpu-clocks" || arg == "--lgc")
{
check_params(1);
this->lock_gpu_clocks(first[1]);
first += 2;
}
else if (arg == "--run-once")
{
this->enable_run_once();
@@ -569,6 +607,85 @@ void option_parser::print_help_axis() const
fmt::print("{}\n", ::cli_help_axis_text);
}
void option_parser::set_persistence_mode(const std::string &state)
try
{
m_exit_after_parsing = true;
nvbench::int32_t state_val{};
::parse(state, state_val);
for (nvbench::device_info &device : m_recent_devices)
{
fmt::print("Turning persistence mode {} for device '{}' ({}).\n",
static_cast<bool>(state_val) ? "ON" : "OFF",
device.get_name(),
device.get_id());
device.set_persistence_mode(static_cast<bool>(state_val));
}
}
catch (std::exception &e)
{
NVBENCH_THROW(std::runtime_error,
"Error handling option `--persistence-mode {}`:\n{}",
state,
e.what());
}
void option_parser::lock_gpu_clocks(const std::string &rate)
try
{
m_exit_after_parsing = true;
nvbench::device_info::clock_rate rate_val;
if (rate == "reset" || rate == "unlock" || rate == "none")
{
rate_val = nvbench::device_info::clock_rate::none;
}
else if (rate == "base" || rate == "tdp")
{
rate_val = nvbench::device_info::clock_rate::base;
}
else if (rate == "max" || rate == "maximum")
{
rate_val = nvbench::device_info::clock_rate::maximum;
}
else
{
NVBENCH_THROW(std::runtime_error,
"Unsupported argument: '{}'. Valid values are {}",
rate,
"{reset, base, max}");
}
for (nvbench::device_info &device : m_recent_devices)
{
if (rate_val == nvbench::device_info::clock_rate::none)
{
fmt::print("Unlocking clocks for device '{}' ({}).\n",
device.get_name(),
device.get_id());
}
else
{
fmt::print("Locking clocks to '{}' for device '{}' ({}).\n",
rate,
device.get_name(),
device.get_id());
}
device.lock_gpu_clocks(rate_val);
}
}
catch (std::exception &e)
{
NVBENCH_THROW(std::runtime_error,
"Error handling option `--lock-gpu-clocks {}`:\n{}",
rate,
e.what());
}
void option_parser::enable_run_once()
{
// If no active benchmark, save args as global.
@@ -606,7 +723,7 @@ try
catch (std::exception &e)
{
NVBENCH_THROW(std::runtime_error,
"Error parsing --benchmark `{}`:\n{}",
"Error handling option --benchmark `{}`:\n{}",
name,
e.what());
}
@@ -620,21 +737,26 @@ void option_parser::replay_global_args()
void option_parser::update_devices(const std::string &devices)
try
{
auto device_vec = ::parse_devices(devices);
// If no active benchmark, save args as global.
if (m_benchmarks.empty())
{
m_global_benchmark_args.push_back("--devices");
m_global_benchmark_args.push_back(devices);
return;
}
else
{
benchmark_base &bench = *m_benchmarks.back();
bench.set_devices(device_vec);
}
benchmark_base &bench = *m_benchmarks.back();
bench.set_devices(parse_values<nvbench::int32_t>(devices));
m_recent_devices = std::move(device_vec);
}
catch (std::exception &e)
{
NVBENCH_THROW(std::runtime_error,
"Error parsing --devices `{}`:\n{}",
"Error handling option --devices `{}`:\n{}",
devices,
e.what());
}
@@ -710,7 +832,7 @@ try
catch (std::exception &e)
{
NVBENCH_THROW(std::runtime_error,
"Error parsing --axis `{}`:\n{}",
"Error handling option --axis `{}`:\n{}",
spec,
e.what());
}
@@ -820,7 +942,7 @@ try
catch (std::exception &e)
{
NVBENCH_THROW(std::runtime_error,
"Error parsing `{} {}`:\n{}",
"Error handling option `{} {}`:\n{}",
prop_arg,
prop_val,
e.what());
@@ -866,7 +988,7 @@ try
catch (std::exception &e)
{
NVBENCH_THROW(std::runtime_error,
"Error parsing `{} {}`:\n{}",
"Error handling option `{} {}`:\n{}",
prop_arg,
prop_val,
e.what());

View File

@@ -18,6 +18,7 @@
#pragma once
#include <nvbench/device_info.cuh>
#include <nvbench/printer_multiplex.cuh>
#include <iosfwd>
@@ -89,6 +90,9 @@ private:
void print_help() const;
void print_help_axis() const;
void set_persistence_mode(const std::string &state);
void lock_gpu_clocks(const std::string &rate);
void enable_run_once();
void add_benchmark(const std::string &name);
@@ -123,6 +127,11 @@ private:
// Store benchmark modifiers passed in before any benchmarks are requested as
// "global args". Replay them after every benchmark.
std::vector<std::string> m_global_benchmark_args;
// List of devices specified by the most recent --devices option, or all
// devices if --devices has not been used.
std::vector<nvbench::device_info> m_recent_devices;
benchmark_vector m_benchmarks;
// Manages lifetimes of any ofstreams opened for m_printer.
@@ -136,6 +145,9 @@ private:
// True if any stdout printers have been added to m_printer.
bool m_have_stdout_printer{false};
// Used for device modification commands like --lock-gpu-clocks
bool m_exit_after_parsing{false};
};
} // namespace nvbench