CUPTI support

This commit is contained in:
Georgy Evtushenko
2021-09-01 19:43:12 +03:00
parent 3d6c16f8ba
commit 1bc715267c
26 changed files with 1705 additions and 4 deletions

2
.gitignore vendored
View File

@@ -1 +1,3 @@
build/
.idea
cmake-build-*

View File

@@ -17,6 +17,7 @@ project(NVBench
nvbench_init_rapids_cmake()
option(NVBench_ENABLE_NVML "Build with NVML support from the Cuda Toolkit." ON)
option(NVBench_ENABLE_CUPTI "Build NVBench with CUPTI." ON)
option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)

71
cmake/NVBenchCUPTI.cmake Normal file
View File

@@ -0,0 +1,71 @@
# Since this file is installed, we need to make sure that the CUDAToolkit has
# been found by consumers:
if (NOT TARGET CUDA::toolkit)
find_package(CUDAToolkit REQUIRED)
endif()
# CUPTI ships under <toolkit>/extras/CUPTI; cache its root for the lookups below.
set(nvbench_cupti_root "${CUDAToolkit_LIBRARY_ROOT}/extras/CUPTI")
# The CUPTI targets in FindCUDAToolkit are broken:
# - The dll locations are not specified
# - Dependent libraries nvperf_* are not linked.
# So we create our own targets:
# Creates an imported target `nvbench::<dep_name_lower>` for a CUPTI shared
# library shipped with the CUDA Toolkit.
#
# Arguments:
#   dep_name - Library base name (e.g. `cupti`, `nvperf_host`); case-insensitive.
#
# Cache variables created (marked advanced):
#   NVBench_<DEP>_DLL     - Windows only: full path to the dll.
#   NVBench_<DEP>_LIBRARY - Full path to the import .lib (Windows) or .so (other).
#
# Relies on `nvbench_cupti_root` being set by this file before the call.
function(nvbench_add_cupti_dep dep_name)
  string(TOLOWER ${dep_name} dep_name_lower)
  string(TOUPPER ${dep_name} dep_name_upper)

  add_library(nvbench::${dep_name_lower} SHARED IMPORTED)

  if (WIN32)
    # Attempt to locate the dll in the expected location. This is necessary
    # because the CUPTI dll has a versioned suffix, so we can't directly search
    # for it with find_file.
    file(GLOB dep_dll_paths "${nvbench_cupti_root}/lib64/${dep_name_lower}*dll")
    if (dep_dll_paths)
      # GLOB may match several versioned dlls; `cmake_path(GET ...)` on a
      # ;-list would produce a bogus filename, so pick one deterministically.
      list(SORT dep_dll_paths)
      list(GET dep_dll_paths 0 dep_dll_path)
      cmake_path(GET dep_dll_path FILENAME dep_dll_filename)
    else()
      # If the dll was not found in the expected location, use a default
      # filename as a user hint.
      set(dep_dll_filename ${dep_name_lower}.dll)
    endif()

    # Use find_file to create a cache variable and mark the file as REQUIRED.
    find_file(NVBench_${dep_name_upper}_DLL ${dep_dll_filename} REQUIRED
      DOC "The full path to ${dep_name_lower}.dll from the CUDA Toolkit."
      HINTS "${nvbench_cupti_root}/lib64/"
    )
    mark_as_advanced(NVBench_${dep_name_upper}_DLL)

    # The .libs don't have suffixes, so we can just directly search for them.
    find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower}.lib REQUIRED
      DOC "The full path to ${dep_name_lower}.lib from the CUDA Toolkit."
      HINTS "${nvbench_cupti_root}/lib64/"
    )
    mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY)

    set_target_properties(nvbench::${dep_name_lower} PROPERTIES
      IMPORTED_LOCATION "${NVBench_${dep_name_upper}_DLL}"
      IMPORTED_IMPLIB "${NVBench_${dep_name_upper}_LIBRARY}"
    )
  else()
    find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower} REQUIRED
      DOC "The full path to lib${dep_name_lower}.so from the CUDA Toolkit."
      HINTS "${nvbench_cupti_root}/lib64"
    )
    mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY)
    set_target_properties(nvbench::${dep_name_lower} PROPERTIES
      IMPORTED_LOCATION "${NVBench_${dep_name_upper}_LIBRARY}"
    )
  endif()
endfunction()
# Create imported targets for CUPTI and its required nvperf_* companions:
nvbench_add_cupti_dep(nvperf_target)
nvbench_add_cupti_dep(nvperf_host)
nvbench_add_cupti_dep(cupti)
# FindCUDAToolkit does not link the nvperf libraries; attach them as usage
# requirements so consumers of nvbench::cupti get them transitively.
target_link_libraries(nvbench::cupti INTERFACE
nvbench::nvperf_target
nvbench::nvperf_host
)
# CUPTI headers live outside the toolkit's normal include directory.
target_include_directories(nvbench::cupti INTERFACE
"${nvbench_cupti_root}/include"
)

View File

@@ -64,3 +64,10 @@ if (NVBench_ENABLE_NVML)
include("${CMAKE_CURRENT_LIST_DIR}/NVBenchNVML.cmake")
list(APPEND ctk_libraries nvbench::nvml)
endif()
################################################################################
# CUDAToolkit -> CUPTI
if (NVBench_ENABLE_CUPTI)
include("${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake")
list(APPEND ctk_libraries CUDA::cuda_driver nvbench::cupti)
endif()

View File

@@ -23,7 +23,9 @@ endif()
function(nvbench_setup_dep_dlls target_name)
# The custom command below fails when there aren't any runtime DLLs to copy,
# so only enable it when a relevant dependency is enabled:
if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD AND NVBench_ENABLE_NVML)
if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD AND
(NVBench_ENABLE_NVML OR
NVBench_ENABLE_CUPTI))
add_custom_command(TARGET ${target_name}
POST_BUILD
COMMAND

View File

@@ -11,6 +11,15 @@ macro(nvbench_generate_exports)
)
endif()
if (NVBench_ENABLE_CUPTI)
string(APPEND nvbench_build_export_code_block
"include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake\")\n"
)
string(APPEND nvbench_install_export_code_block
"include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake\")\n"
)
endif()
rapids_export(BUILD NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"

View File

@@ -39,6 +39,14 @@ if (NVBench_ENABLE_NVML)
)
endif()
if (NVBench_ENABLE_CUPTI)
install(
FILES
"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake"
DESTINATION "${config_install_location}"
)
endif()
# Call with a list of library targets to generate install rules:
function(nvbench_install_libraries)
install(TARGETS ${ARGN}

View File

@@ -3,5 +3,9 @@ function(nvbench_write_config_header filepath)
set(NVBENCH_HAS_NVML 1)
endif()
if (NVBench_ENABLE_CUPTI)
set(NVBENCH_HAS_CUPTI 1)
endif()
configure_file("${NVBench_SOURCE_DIR}/cmake/config.cuh.in" "${filepath}")
endfunction()

View File

@@ -20,3 +20,6 @@
// Defined if NVBench has been built with NVML support.
#cmakedefine NVBENCH_HAS_NVML
// Defined if NVBench has been built with CUPTI support.
#cmakedefine NVBENCH_HAS_CUPTI

View File

@@ -5,6 +5,7 @@ set(example_srcs
exec_tag_timer.cu
skip.cu
throughput.cu
auto_throughput.cu
)
# Metatarget for all examples:

View File

@@ -0,0 +1,87 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// Strided copy kernel used by the throughput benchmark: the grid walks the
// input with a thread-stride pattern controlled by `stride`, copying
// ItemsPerThread values per step.
// Reads wrap via `% elements`, so they stay inside `in` (size `elements`);
// writes go to `tid + j * elements`, inside `out`
// (size elements * ItemsPerThread).
template <int ItemsPerThread>
__global__ void kernel(std::size_t stride,
std::size_t elements,
const nvbench::int32_t * __restrict__ in,
nvbench::int32_t *__restrict__ out)
{
// Global thread index and total number of threads in the grid.
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const std::size_t step = gridDim.x * blockDim.x;
for (std::size_t i = stride * tid;
i < stride * elements;
i += stride * step)
{
for (int j = 0; j < ItemsPerThread; j++)
{
const auto read_id = (ItemsPerThread * i + j) % elements;
const auto write_id = tid + j * elements;
out[write_id] = in[read_id];
}
}
}
// `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
// and cache hit rates.
//
// Calling state.collect_*() enables particular metric collection if nvbench
// was built with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
// Benchmark entry point: copies a 128 MiB int32 buffer with the strided
// kernel above and asks NVBench to collect CUPTI metrics for each launch.
// `state` carries the axis values ("Stride") and receives the measurements.
template <int ItemsPerThread>
void throughput_bench(nvbench::state &state,
nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
{
// Allocate input data:
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
thrust::device_vector<nvbench::int32_t> input(elements);
// Output is ItemsPerThread times larger: each thread writes one value per j.
thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
// Provide throughput information:
state.add_element_count(elements, "Elements");
// These collect_* calls are no-ops unless NVBench was built with CUPTI.
state.collect_dram_throughput();
state.collect_l1_hit_rates();
state.collect_l2_hit_rates();
state.collect_loads_efficiency();
state.collect_stores_efficiency();
const auto threads_in_block = 256;
// Round up so every element gets a thread.
const auto blocks_in_grid = (elements + threads_in_block - 1) /
threads_in_block;
state.exec([&](nvbench::launch &launch) {
kernel<ItemsPerThread>
<<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
stride,
elements,
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()));
});
}
// Instantiate the benchmark for 1 and 2 items per thread, with strides 1 and 4.
using items_per_thread = nvbench::enum_type_list<1, 2>;
NVBENCH_BENCH_TYPES(throughput_bench, NVBENCH_TYPE_AXES(items_per_thread))
.add_int64_axis("Stride", nvbench::range(1, 4, 3));

View File

@@ -26,6 +26,10 @@ set(srcs
detail/state_generator.cxx
)
if (NVBench_ENABLE_CUPTI)
list(APPEND srcs detail/measure_cupti.cu cupti_profiler.cxx)
endif()
if (NVBench_ENABLE_NVML)
list(APPEND srcs internal/nvml.cxx)
endif()

View File

@@ -17,6 +17,7 @@
*/
#include <nvbench/cuda_call.cuh>
#include <nvbench/config.cuh>
#include <fmt/format.h>
@@ -44,6 +45,34 @@ void throw_error(const std::string &filename,
command));
}
#ifdef NVBENCH_HAS_CUPTI
// Formats a CUDA driver API error (symbolic name + description) and throws it
// as std::runtime_error, tagged with the file/line and the failing call text.
void throw_error(const std::string &filename,
std::size_t lineno,
const std::string &command,
CUresult error_code)
{
// NOTE(review): cuGetErrorName/cuGetErrorString leave these nullptr for an
// unrecognized code; fmt::format on a null const char* is undefined — confirm
// error codes here always come straight from a driver call.
const char *name = nullptr;
cuGetErrorName(error_code, &name);
const char *string = nullptr;
cuGetErrorString(error_code, &string);
throw std::runtime_error(fmt::format("{}:{}: Driver API call returned error: "
"{}: {}\nCommand: '{}'",
filename,
lineno,
name,
string,
command));
}
#else
// Without CUPTI support this overload is a stub so callers still link.
// NOTE(review): it silently discards the error — confirm that is intended.
void throw_error(const std::string &,
std::size_t,
const std::string &,
CUresult)
{}
#endif
void exit_error(const std::string &filename,
std::size_t lineno,
const std::string &command,

View File

@@ -19,6 +19,7 @@
#pragma once
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <string>
@@ -36,6 +37,20 @@
} \
} while (false)
/// Throws a std::runtime_error if `call` doesn't return `CUDA_SUCCESS`.
#define NVBENCH_DRIVER_API_CALL(call) \
do \
{ \
const CUresult nvbench_cuda_call_error = call; \
if (nvbench_cuda_call_error != CUDA_SUCCESS) \
{ \
nvbench::cuda_call::throw_error(__FILE__, \
__LINE__, \
#call, \
nvbench_cuda_call_error); \
} \
} while (false)
/// Terminates process with failure status if `call` doesn't return
/// `cudaSuccess`.
#define NVBENCH_CUDA_CALL_NOEXCEPT(call) \
@@ -59,6 +74,11 @@ void throw_error(const std::string &filename,
const std::string &call,
cudaError_t error);
void throw_error(const std::string &filename,
std::size_t lineno,
const std::string &call,
CUresult error);
void exit_error(const std::string &filename,
std::size_t lineno,
const std::string &command,

129
nvbench/cupti_profiler.cuh Normal file
View File

@@ -0,0 +1,129 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/config.cuh>
#include <nvbench/device_info.cuh>
#include <string>
#include <vector>
#include <optional>
namespace nvbench::detail
{
#ifdef NVBENCH_HAS_CUPTI
/**
* Pass required metrics in the constructor and organize your code as follows
* to get counters back:
*
* ```
* cupti_profiler cupti(
*   nvbench::device_info{0},
*   { "l1tex__t_sector_hit_rate.pct" });
*
* cupti.prepare_user_loop();
*
* do
* {
*   cupti.start_user_loop();
*
*   kernel_1<<<1, 1>>>();
*   // ...
*   kernel_n<<<1, 1>>>();
*
*   cupti.stop_user_loop();
* } while (cupti.is_replay_required());
*
* cupti.process_user_loop();
*
* auto result = cupti.get_counter_values();
* ```
*
* Check Perfworks Metric table here for the full list of metrics:
* https://docs.nvidia.com/cupti/r_main.html#metrics-reference-7x
*/
class cupti_profiler
{
// Set to true only after the constructor completes every init stage; the
// destructor checks it before deinitializing CUPTI.
bool m_available {};
// GPU chip name reported by CUPTI; keys all NVPW evaluator/config creation.
std::string m_chip_name;
// Counter data
std::vector<std::string> m_metric_names;
// Serialized NVPW/CUPTI images produced during initialization:
std::vector<std::uint8_t> m_data_image_prefix;
std::vector<std::uint8_t> m_config_image;
std::vector<std::uint8_t> m_data_image;
std::vector<std::uint8_t> m_data_scratch_buffer;
std::vector<std::uint8_t> m_availability_image;
nvbench::device_info m_device;
// CUPTI runs a series of replay passes, where each pass contains a sequence
// of ranges. Every metric enabled in the configuration is collected
// separately per unique range in the pass. CUPTI supports auto and
// user-defined ranges. With auto range mode, ranges are defined around each
// kernel automatically. In the user range mode, ranges are defined manually.
// We define a single user range for the whole measurement.
static const int m_num_ranges = 1;
public:
// Move only
cupti_profiler(cupti_profiler &&) noexcept;
cupti_profiler &operator=(cupti_profiler &&) noexcept;
cupti_profiler(const cupti_profiler &) = delete;
cupti_profiler &operator=(const cupti_profiler &) = delete;
cupti_profiler(nvbench::device_info device,
std::vector<std::string> &&metric_names);
~cupti_profiler();
[[nodiscard]] bool is_initialized() const;
/// Should be called before replay loop
void prepare_user_loop();
/// Should be called before any kernel calls in the replay loop
void start_user_loop();
/// Should be called after all kernel calls in the replay loop
void stop_user_loop();
/// Should be called after the replay loop
void process_user_loop();
/// Indicates whether another iteration of the replay loop is required
[[nodiscard]] bool is_replay_required();
/// Returns counters for metrics requested in the constructor
[[nodiscard]] std::vector<double> get_counter_values();
private:
// Initialization pipeline, invoked in this order by the constructor:
void initialize_profiler();
void initialize_chip_name();
void initialize_availability_image();
static void initialize_nvpw();
void initialize_config_image();
void initialize_counter_data_prefix_image();
void initialize_counter_data_image();
};
#endif
} // namespace nvbench::detail

775
nvbench/cupti_profiler.cxx Normal file
View File

@@ -0,0 +1,775 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cupti_profiler_target.h>
#include <cupti_target.h>
#include <nvbench/cupti_profiler.cuh>
#include <nvbench/device_info.cuh>
#include <nvperf_cuda_host.h>
#include <nvperf_host.h>
#include <nvperf_target.h>
#include <fmt/format.h>
#include <stdexcept>
namespace nvbench::detail
{
namespace
{
void cupti_call(const CUptiResult status)
{
if (status != CUPTI_SUCCESS)
{
const char *errstr{};
cuptiGetResultString(status, &errstr);
throw std::runtime_error(
fmt::format("CUPTI call returned error: {}\n", errstr));
}
}
// Checks an NVPW (Perfworks) return code; throws std::runtime_error with the
// numeric status on failure (NVPW offers no string lookup here).
void nvpw_call(const NVPA_Status status)
{
  if (status == NVPA_STATUS_SUCCESS)
  {
    return;
  }
  throw std::runtime_error(
    fmt::format("NVPW call returned error: {}\n", status));
}
} // namespace
// Constructs a profiler bound to `device` that will collect `metric_names`.
// Runs the full CUPTI/NVPW initialization pipeline; throws std::runtime_error
// on any failure (unsupported device, CUPTI or NVPW error).
cupti_profiler::cupti_profiler(nvbench::device_info device,
                               std::vector<std::string> &&metric_names)
    // The parameter is an rvalue reference: take ownership instead of copying.
    : m_metric_names(std::move(metric_names))
    , m_device(device)
{
  initialize_profiler();
  initialize_chip_name();
  initialize_availability_image();
  initialize_nvpw();
  initialize_config_image();
  initialize_counter_data_prefix_image();
  initialize_counter_data_image();
  // Flag success only after every stage completed; the destructor keys its
  // CUPTI deinitialization off this flag.
  m_available = true;
}
// Move constructor: builds m_device from rhs, then delegates all member
// transfer to the move-assignment operator below.
cupti_profiler::cupti_profiler(cupti_profiler &&rhs) noexcept
: m_device(rhs.m_device.get_id(), rhs.m_device.get_cuda_device_prop())
{
(*this) = std::move(rhs);
}
// Move assignment: transfers every image/buffer and clears rhs's availability
// flag so only one object ever deinitializes CUPTI in its destructor.
cupti_profiler &cupti_profiler::operator=(cupti_profiler &&rhs) noexcept
{
m_device = rhs.m_device;
m_available = rhs.m_available;
m_chip_name = std::move(rhs.m_chip_name);
m_metric_names = std::move(rhs.m_metric_names);
m_data_image_prefix = std::move(rhs.m_data_image_prefix);
m_config_image = std::move(rhs.m_config_image);
m_data_image = std::move(rhs.m_data_image);
m_data_scratch_buffer = std::move(rhs.m_data_scratch_buffer);
m_availability_image = std::move(rhs.m_availability_image);
rhs.m_available = false;
return *this;
}
// Verifies the device supports the CUPTI profiling API and initializes the
// CUPTI profiler; throws std::runtime_error for unsupported devices.
void cupti_profiler::initialize_profiler()
{
if (!m_device.is_cupti_supported())
{
throw std::runtime_error(fmt::format("Device: {} isn't supported (CC {})",
m_device.get_id(),
m_device.get_sm_version()));
}
CUpti_Profiler_Initialize_Params params = {
CUpti_Profiler_Initialize_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerInitialize(&params));
}
// Queries CUPTI for the device's chip name; NVPW evaluator/config creation is
// keyed on this string.
void cupti_profiler::initialize_chip_name()
{
CUpti_Device_GetChipName_Params params = {
CUpti_Device_GetChipName_Params_STRUCT_SIZE};
params.deviceIndex = m_device.get_id();
cupti_call(cuptiDeviceGetChipName(&params));
m_chip_name = std::string(params.pChipName);
}
// Fetches the counter-availability image for the device's context using the
// usual CUPTI two-call pattern: first call reports the size, second fills it.
void cupti_profiler::initialize_availability_image()
{
CUpti_Profiler_GetCounterAvailability_Params params = {
CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE};
params.ctx = m_device.get_context();
cupti_call(cuptiProfilerGetCounterAvailability(&params));
m_availability_image.clear();
m_availability_image.resize(params.counterAvailabilityImageSize);
params.pCounterAvailabilityImage = m_availability_image.data();
cupti_call(cuptiProfilerGetCounterAvailability(&params));
}
// One-time host-side initialization of the NVPW (Perfworks) library.
void cupti_profiler::initialize_nvpw()
{
NVPW_InitializeHost_Params params = {NVPW_InitializeHost_Params_STRUCT_SIZE};
nvpw_call(NVPW_InitializeHost(&params));
}
namespace
{
// Thin wrapper around NVPW_MetricEvalRequest: translates a human-readable
// metric name into an eval request on construction and can enumerate the raw
// counters that metric depends on. Holds a non-owning pointer to the
// evaluator that created it.
class eval_request
{
NVPW_MetricsEvaluator *evaluator_ptr;
public:
eval_request(NVPW_MetricsEvaluator *evaluator_ptr,
const std::string &metric_name)
: evaluator_ptr(evaluator_ptr)
{
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params = {
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE};
params.pMetricsEvaluator = evaluator_ptr;
params.pMetricName = metric_name.c_str();
params.pMetricEvalRequest = &request;
params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
nvpw_call(
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(&params));
}
// Returns the raw counter names this metric depends on (two-call pattern:
// size query, then fill).
// NOTE(review): the returned char pointers are owned by the evaluator, not
// by this object — they dangle once the evaluator is destroyed; confirm
// callers consume them before that.
[[nodiscard]] std::vector<const char *> get_raw_dependencies()
{
std::vector<const char *> raw_dependencies;
NVPW_MetricsEvaluator_GetMetricRawDependencies_Params params = {
NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE};
params.pMetricsEvaluator = evaluator_ptr;
params.pMetricEvalRequests = &request;
params.numMetricEvalRequests = 1;
params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
nvpw_call(NVPW_MetricsEvaluator_GetMetricRawDependencies(&params));
raw_dependencies.resize(params.numRawDependencies);
params.ppRawDependencies = raw_dependencies.data();
nvpw_call(NVPW_MetricsEvaluator_GetMetricRawDependencies(&params));
return raw_dependencies;
}
NVPW_MetricEvalRequest request;
};
// RAII owner of an NVPW_MetricsEvaluator plus its scratch buffer. The
// evaluator is created for a chip (and optionally a counter-availability /
// counter-data image) and destroyed in the destructor.
class metric_evaluator
{
bool initialized{};
NVPW_MetricsEvaluator *evaluator_ptr;
// Backing storage the evaluator requires for its lifetime.
std::vector<std::uint8_t> scratch_buffer;
public:
metric_evaluator(const std::string &chip_name,
const std::uint8_t *counter_availability_image = nullptr,
const std::uint8_t *counter_data_image = nullptr,
const std::size_t counter_data_image_size = 0)
{
// First ask NVPW how much scratch space the evaluator needs...
NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params
scratch_buffer_param = {
NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE};
scratch_buffer_param.pChipName = chip_name.c_str();
scratch_buffer_param.pCounterAvailabilityImage = counter_availability_image;
nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(
&scratch_buffer_param));
scratch_buffer.resize(scratch_buffer_param.scratchBufferSize);
// ...then create the evaluator inside that buffer.
NVPW_CUDA_MetricsEvaluator_Initialize_Params evaluator_params = {
NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE};
evaluator_params.scratchBufferSize = scratch_buffer.size();
evaluator_params.pScratchBuffer = scratch_buffer.data();
evaluator_params.pChipName = chip_name.c_str();
evaluator_params.pCounterAvailabilityImage = counter_availability_image;
evaluator_params.pCounterDataImage = counter_data_image;
evaluator_params.counterDataImageSize = counter_data_image_size;
nvpw_call(NVPW_CUDA_MetricsEvaluator_Initialize(&evaluator_params));
evaluator_ptr = evaluator_params.pMetricsEvaluator;
initialized = true;
}
~metric_evaluator()
{
if (initialized)
{
NVPW_MetricsEvaluator_Destroy_Params params = {
NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE};
params.pMetricsEvaluator = evaluator_ptr;
nvpw_call(NVPW_MetricsEvaluator_Destroy(&params));
}
}
// Builds an eval_request for `metric_name` bound to this evaluator.
[[nodiscard]] eval_request create_request(const std::string &metric_name)
{
return {evaluator_ptr, metric_name};
}
// Implicit conversion so the wrapper can be passed straight to NVPW APIs.
[[nodiscard]] operator NVPW_MetricsEvaluator *() const
{
return evaluator_ptr;
}
};
} // namespace
namespace
{
// Expands high-level metric names into the raw counter requests NVPW needs
// for config / counter-data-prefix generation.
// NOTE(review): the pMetricName pointers stored in the returned requests come
// from get_raw_dependencies() and appear to be owned by the local `evaluator`,
// which is destroyed when this function returns — confirm NVPW copies the
// names in the Add*Metrics calls, otherwise these dangle.
[[nodiscard]] std::vector<NVPA_RawMetricRequest> get_raw_metric_requests(
const std::string &chip_name,
const std::vector<std::string> &metric_names,
const std::uint8_t *counter_availability_image = nullptr)
{
metric_evaluator evaluator(chip_name, counter_availability_image);
// Collect the raw dependencies of every requested metric.
std::vector<const char *> raw_metric_names;
raw_metric_names.reserve(metric_names.size());
for (auto &metric_name : metric_names)
{
for (auto &raw_dependency :
evaluator.create_request(metric_name).get_raw_dependencies())
{
raw_metric_names.push_back(raw_dependency);
}
}
// Wrap each raw counter name in an NVPA request struct.
std::vector<NVPA_RawMetricRequest> raw_requests;
raw_requests.reserve(raw_metric_names.size());
for (auto &raw_name : raw_metric_names)
{
NVPA_RawMetricRequest metricRequest = {NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE};
metricRequest.pMetricName = raw_name;
metricRequest.isolated = true;
metricRequest.keepInstances = true;
raw_requests.push_back(metricRequest);
}
return raw_requests;
}
// RAII owner of an NVPA_RawMetricsConfig. The constructor runs the full NVPW
// config-image pipeline (create, set availability, one pass group with the
// requested metrics, generate); get_config_image() then serializes the result.
class metrics_config
{
bool initialized{};
// Creates the raw metrics config object for the chip.
void create(const std::string &chip_name,
const std::uint8_t *availability_image)
{
NVPW_CUDA_RawMetricsConfig_Create_V2_Params params = {
NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE};
params.activityKind = NVPA_ACTIVITY_KIND_PROFILER;
params.pChipName = chip_name.c_str();
params.pCounterAvailabilityImage = availability_image;
nvpw_call(NVPW_CUDA_RawMetricsConfig_Create_V2(&params));
raw_metrics_config = params.pRawMetricsConfig;
initialized = true;
}
// Restricts the config to counters available on this device/context.
void set_availability_image(const std::uint8_t *availability_image)
{
NVPW_RawMetricsConfig_SetCounterAvailability_Params params = {
NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
params.pCounterAvailabilityImage = availability_image;
nvpw_call(NVPW_RawMetricsConfig_SetCounterAvailability(&params));
}
// Opens the (single) pass group metrics are added into.
void begin_config_group()
{
NVPW_RawMetricsConfig_BeginPassGroup_Params params = {
NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
nvpw_call(NVPW_RawMetricsConfig_BeginPassGroup(&params));
}
// Registers the raw counter requests with the config.
void add_metrics(const std::vector<NVPA_RawMetricRequest> &raw_metric_requests)
{
NVPW_RawMetricsConfig_AddMetrics_Params params = {
NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
params.pRawMetricRequests = raw_metric_requests.data();
params.numMetricRequests = raw_metric_requests.size();
nvpw_call(NVPW_RawMetricsConfig_AddMetrics(&params));
}
// Closes the pass group opened by begin_config_group().
void end_config_group()
{
NVPW_RawMetricsConfig_EndPassGroup_Params params = {
NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
nvpw_call(NVPW_RawMetricsConfig_EndPassGroup(&params));
}
// Finalizes the config so its image can be extracted.
void generate()
{
NVPW_RawMetricsConfig_GenerateConfigImage_Params params = {
NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
nvpw_call(NVPW_RawMetricsConfig_GenerateConfigImage(&params));
}
public:
metrics_config(const std::string &chip_name,
const std::vector<NVPA_RawMetricRequest> &raw_metric_requests,
const std::uint8_t *availability_image)
{
create(chip_name, availability_image);
set_availability_image(availability_image);
begin_config_group();
add_metrics(raw_metric_requests);
end_config_group();
generate();
}
// Serializes the generated config (two-call pattern: size, then bytes).
[[nodiscard]] std::vector<std::uint8_t> get_config_image()
{
NVPW_RawMetricsConfig_GetConfigImage_Params params = {
NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
params.bytesAllocated = 0;
params.pBuffer = nullptr;
nvpw_call(NVPW_RawMetricsConfig_GetConfigImage(&params));
std::vector<std::uint8_t> config_image(params.bytesCopied);
params.bytesAllocated = config_image.size();
params.pBuffer = config_image.data();
nvpw_call(NVPW_RawMetricsConfig_GetConfigImage(&params));
return config_image;
}
~metrics_config()
{
if (initialized)
{
NVPW_RawMetricsConfig_Destroy_Params params = {
NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
// Destructor must not throw, so the result is deliberately unchecked.
NVPW_RawMetricsConfig_Destroy(&params);
}
}
NVPA_RawMetricsConfig *raw_metrics_config;
};
} // namespace
// Builds the serialized metrics-config image for the requested metrics,
// constrained by this device's counter-availability image.
void cupti_profiler::initialize_config_image()
{
m_config_image =
metrics_config(m_chip_name,
get_raw_metric_requests(m_chip_name,
m_metric_names,
m_availability_image.data()),
m_availability_image.data())
.get_config_image();
}
namespace
{
// RAII owner of an NVPA_CounterDataBuilder, used to generate the
// counter-data prefix image.
class counter_data_builder
{
bool initialized{};
public:
counter_data_builder(const std::string &chip_name,
const std::uint8_t *pCounterAvailabilityImage)
{
NVPW_CUDA_CounterDataBuilder_Create_Params params = {
NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE};
params.pChipName = chip_name.c_str();
params.pCounterAvailabilityImage = pCounterAvailabilityImage;
nvpw_call(NVPW_CUDA_CounterDataBuilder_Create(&params));
builder = params.pCounterDataBuilder;
initialized = true;
}
~counter_data_builder()
{
if (initialized)
{
NVPW_CounterDataBuilder_Destroy_Params params = {
NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE};
params.pCounterDataBuilder = builder;
// Destructor must not throw, so the result is deliberately unchecked.
NVPW_CounterDataBuilder_Destroy(&params);
}
}
NVPA_CounterDataBuilder *builder;
};
} // namespace
// Generates the counter-data prefix image: registers all raw metric requests
// with a builder, then serializes the prefix with the two-call size/fill
// pattern.
// NOTE(review): here the availability image is deliberately nullptr, unlike
// initialize_config_image() — confirm this asymmetry is intended.
void cupti_profiler::initialize_counter_data_prefix_image()
{
const std::uint8_t *counter_availability_image = nullptr;
std::vector<NVPA_RawMetricRequest> raw_metric_requests =
get_raw_metric_requests(m_chip_name,
m_metric_names,
counter_availability_image);
counter_data_builder data_builder(m_chip_name, counter_availability_image);
{
NVPW_CounterDataBuilder_AddMetrics_Params params = {
NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE};
params.pCounterDataBuilder = data_builder.builder;
params.pRawMetricRequests = raw_metric_requests.data();
params.numMetricRequests = raw_metric_requests.size();
nvpw_call(NVPW_CounterDataBuilder_AddMetrics(&params));
}
{
NVPW_CounterDataBuilder_GetCounterDataPrefix_Params params = {
NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE};
params.pCounterDataBuilder = data_builder.builder;
params.bytesAllocated = 0;
params.pBuffer = nullptr;
nvpw_call(NVPW_CounterDataBuilder_GetCounterDataPrefix(&params));
m_data_image_prefix.resize(params.bytesCopied);
params.bytesAllocated = m_data_image_prefix.size();
params.pBuffer = m_data_image_prefix.data();
nvpw_call(NVPW_CounterDataBuilder_GetCounterDataPrefix(&params));
}
}
namespace
{
// Asks CUPTI how many bytes the counter data image needs for the given
// options.
[[nodiscard]] std::size_t
get_counter_data_image_size(CUpti_Profiler_CounterDataImageOptions *options)
{
CUpti_Profiler_CounterDataImage_CalculateSize_Params params = {
CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE};
params.pOptions = options;
params.sizeofCounterDataImageOptions =
CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
cupti_call(cuptiProfilerCounterDataImageCalculateSize(&params));
return params.counterDataImageSize;
}
} // namespace
// Sizes and initializes the counter-data image and its scratch buffer for the
// single user range used by this profiler.
void cupti_profiler::initialize_counter_data_image()
{
  // Zero-initialize: the options struct was previously left uninitialized, so
  // any field not explicitly assigned below held indeterminate garbage.
  CUpti_Profiler_CounterDataImageOptions counter_data_image_options = {};
  counter_data_image_options.pCounterDataPrefix = &m_data_image_prefix[0];
  counter_data_image_options.counterDataPrefixSize = m_data_image_prefix.size();
  counter_data_image_options.maxNumRanges = m_num_ranges;
  counter_data_image_options.maxNumRangeTreeNodes = m_num_ranges;
  counter_data_image_options.maxRangeNameLength = 64;

  m_data_image.resize(get_counter_data_image_size(&counter_data_image_options));

  {
    // Initialize the image in-place.
    CUpti_Profiler_CounterDataImage_Initialize_Params params = {
      CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE};
    params.sizeofCounterDataImageOptions =
      CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
    params.pOptions = &counter_data_image_options;
    params.counterDataImageSize = m_data_image.size();
    params.pCounterDataImage = &m_data_image[0];
    cupti_call(cuptiProfilerCounterDataImageInitialize(&params));
  }
  {
    // Size the scratch buffer the image needs during collection.
    CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params params = {
      CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE};
    params.counterDataImageSize = m_data_image.size();
    params.pCounterDataImage = &m_data_image[0];
    cupti_call(
      cuptiProfilerCounterDataImageCalculateScratchBufferSize(&params));
    m_data_scratch_buffer.resize(params.counterDataScratchBufferSize);
  }
  {
    // Bind the scratch buffer to the image.
    CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params params = {
      CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE};
    params.counterDataImageSize = m_data_image.size();
    params.pCounterDataImage = &m_data_image[0];
    params.counterDataScratchBufferSize = m_data_scratch_buffer.size();
    params.pCounterDataScratchBuffer = &m_data_scratch_buffer[0];
    cupti_call(cuptiProfilerCounterDataImageInitializeScratchBuffer(&params));
  }
}
// Deinitializes CUPTI, but only if this instance completed construction and
// was not moved from (m_available is cleared on move).
cupti_profiler::~cupti_profiler()
{
if (is_initialized())
{
CUpti_Profiler_DeInitialize_Params params = {
CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE};
// Destructor must not throw, so the result is deliberately unchecked.
cuptiProfilerDeInitialize(&params);
}
}
// True once the constructor finished every init stage (and this instance
// still owns the CUPTI session).
bool cupti_profiler::is_initialized() const
{
return m_available;
}
// Opens the profiling session and applies the config image. Must be called
// once before the replay loop (see the usage example in cupti_profiler.cuh).
void cupti_profiler::prepare_user_loop()
{
{
CUpti_Profiler_BeginSession_Params params = {
CUpti_Profiler_BeginSession_Params_STRUCT_SIZE};
params.ctx = nullptr;
params.counterDataImageSize = m_data_image.size();
params.pCounterDataImage = &m_data_image[0];
params.counterDataScratchBufferSize = m_data_scratch_buffer.size();
params.pCounterDataScratchBuffer = &m_data_scratch_buffer[0];
// User range/replay mode: we define a single manual range per measurement
// (m_num_ranges == 1) instead of one auto range per kernel.
params.range = CUPTI_UserRange;
params.replayMode = CUPTI_UserReplay;
params.maxRangesPerPass = m_num_ranges;
params.maxLaunchesPerPass = m_num_ranges;
cupti_call(cuptiProfilerBeginSession(&params));
}
{
CUpti_Profiler_SetConfig_Params params = {
CUpti_Profiler_SetConfig_Params_STRUCT_SIZE};
params.pConfig = &m_config_image[0];
params.configSize = m_config_image.size();
params.minNestingLevel = 1;
params.numNestingLevels = 1;
params.passIndex = 0;
cupti_call(cuptiProfilerSetConfig(&params));
}
}
// Begins one replay pass: start the pass, enable profiling, and push the
// single user range. Call before the kernels in each replay iteration.
void cupti_profiler::start_user_loop()
{
{
CUpti_Profiler_BeginPass_Params params = {
CUpti_Profiler_BeginPass_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerBeginPass(&params));
}
{
CUpti_Profiler_EnableProfiling_Params params = {
CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerEnableProfiling(&params));
}
{
CUpti_Profiler_PushRange_Params params = {
CUpti_Profiler_PushRange_Params_STRUCT_SIZE};
std::string rangeName = "nvbench";
params.pRangeName = rangeName.c_str();
cupti_call(cuptiProfilerPushRange(&params));
}
}
// Ends the current replay pass: pop the range and disable profiling, in the
// reverse order of start_user_loop().
void cupti_profiler::stop_user_loop()
{
{
CUpti_Profiler_PopRange_Params params = {
CUpti_Profiler_PopRange_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerPopRange(&params));
}
{
CUpti_Profiler_DisableProfiling_Params params = {
CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerDisableProfiling(&params));
}
}
// Ends the current pass and reports whether CUPTI needs more replay passes
// to collect all configured counters.
bool cupti_profiler::is_replay_required()
{
CUpti_Profiler_EndPass_Params params = {
CUpti_Profiler_EndPass_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerEndPass(&params));
return !params.allPassesSubmitted;
}
// Finalizes the measurement after the replay loop: flush counter data into
// the image, then tear down config and session.
void cupti_profiler::process_user_loop()
{
{
CUpti_Profiler_FlushCounterData_Params params = {
CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerFlushCounterData(&params));
}
{
CUpti_Profiler_UnsetConfig_Params params = {
CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerUnsetConfig(&params));
}
{
CUpti_Profiler_EndSession_Params params = {
CUpti_Profiler_EndSession_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerEndSession(&params));
}
}
// Decode the collected counter-data image into one double per requested
// metric, in the same order as m_metric_names.
//
// Fix: NVPW_MetricsEvaluator_SetDeviceAttributes depends only on the
// evaluator and the counter-data image, not on any particular metric, so it
// is now bound once before the loop instead of once per metric.
std::vector<double> cupti_profiler::get_counter_values()
{
  metric_evaluator evaluator(m_chip_name,
                             m_availability_image.data(),
                             m_data_image.data(),
                             m_data_image.size());

  { // User-range profiling is configured to produce exactly one range;
    // anything else indicates a broken session.
    NVPW_CounterData_GetNumRanges_Params params = {
      NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE};
    params.pCounterDataImage = m_data_image.data();
    nvpw_call(NVPW_CounterData_GetNumRanges(&params));
    if (params.numRanges != 1)
    {
      throw std::runtime_error("Something's gone wrong, one range is expected");
    }
  }

  { // Loop-invariant: bind the counter-data image to the evaluator once.
    NVPW_MetricsEvaluator_SetDeviceAttributes_Params params = {
      NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE};
    params.pMetricsEvaluator    = evaluator;
    params.pCounterDataImage    = m_data_image.data();
    params.counterDataImageSize = m_data_image.size();
    nvpw_call(NVPW_MetricsEvaluator_SetDeviceAttributes(&params));
  }

  const std::size_t range_id{}; // there's only one range
  std::size_t result_id{};
  std::vector<double> result(m_metric_names.size());

  for (const std::string &metric_name : m_metric_names)
  {
    eval_request request = evaluator.create_request(metric_name);

    NVPW_MetricsEvaluator_EvaluateToGpuValues_Params params = {
      NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE};
    params.pMetricsEvaluator           = evaluator;
    params.pMetricEvalRequests         = &request.request;
    params.numMetricEvalRequests       = 1;
    params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
    params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
    params.pCounterDataImage           = m_data_image.data();
    params.counterDataImageSize        = m_data_image.size();
    params.rangeIndex                  = range_id;
    params.isolated                    = true;
    // Each metric's value is written directly into its slot in `result`.
    params.pMetricValues               = &result[result_id++];
    nvpw_call(NVPW_MetricsEvaluator_EvaluateToGpuValues(&params));
  }
  return result;
}
} // namespace nvbench::detail

View File

@@ -42,7 +42,7 @@ measure_cold_base::measure_cold_base(state &exec_state)
, m_min_time{exec_state.get_min_time()}
, m_skip_time{exec_state.get_skip_time()}
, m_timeout{exec_state.get_timeout()}
{}
{ }
void measure_cold_base::check()
{

View File

@@ -185,6 +185,7 @@ private:
}
kernel_launch_timer<use_blocking_kernel> timer(*this);
this->launch_kernel(timer);
this->check_skip_time(m_cuda_timer.get_duration());
}

View File

@@ -0,0 +1,263 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/benchmark_base.cuh>
#include <nvbench/detail/measure_cupti.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/printer_base.cuh>
#include <nvbench/state.cuh>
#include <fmt/format.h>
#include <algorithm>
#include <stdexcept>
namespace nvbench::detail
{
namespace
{
// Identifier for each CUPTI hardware metric NVBench can collect. Values are
// contiguous integers starting at 0 so compile-time recursion over all
// metrics can terminate at `count` (see add_metrics_impl / gen_summary).
enum class metric_id : int
{
dram_peak_sustained_throughput = 0,
global_load_efficiency,
global_store_efficiency,
l1_hit_rate,
l2_hit_rate,
count
};
// Compile-time descriptor for a single CUPTI metric. Each specialization
// provides:
//  - metric_name:  the CUPTI/PerfWorks metric identifier to collect,
//  - summary / hint / short_name / description: presentation strings for
//    the nvbench summary output,
//  - divider:      scale factor applied to the raw value before reporting,
//  - is_collected: whether the user requested this metric on the state.
template <metric_id id>
struct metric_traits;
// Device-memory utilization as a percentage of peak sustained throughput.
template <>
struct metric_traits<metric_id::dram_peak_sustained_throughput>
{
static constexpr const char *metric_name =
"dram__throughput.avg.pct_of_peak_sustained_elapsed";
static constexpr const char *summary =
"Peak Sustained Global Memory Throughput (HW)";
static constexpr const char *hint = "percentage";
static constexpr const char *short_name = "HBWPeak";
static constexpr const char *description =
"The utilization level of the device memory relative to the peak "
"utilization.";
static constexpr double divider = 100.0;
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_dram_throughput_collected();
};
};
// Efficiency of global memory loads (requested vs. required throughput).
template <>
struct metric_traits<metric_id::global_load_efficiency>
{
static constexpr const char *metric_name =
"smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct";
static constexpr const char *summary = "Global Load Efficiency (HW)";
static constexpr const char *hint = "percentage";
static constexpr const char *short_name = "LoadEff";
static constexpr const char *description =
"Ratio of requested global memory load throughput to required global "
"memory load throughput expressed as percentage.";
static constexpr double divider = 100.0;
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_loads_efficiency_collected();
};
};
// Efficiency of global memory stores (requested vs. required throughput).
template <>
struct metric_traits<metric_id::global_store_efficiency>
{
static constexpr const char *metric_name =
"smsp__sass_average_data_bytes_per_sector_mem_global_op_st.pct";
static constexpr const char *summary = "Global Store Efficiency (HW)";
static constexpr const char *hint = "percentage";
static constexpr const char *short_name = "StoreEff";
static constexpr const char *description =
"Ratio of requested global memory store throughput to required global "
"memory store throughput expressed as percentage.";
static constexpr double divider = 100.0;
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_stores_efficiency_collected();
};
};
// Sector hit rate at the L1/TEX cache.
template <>
struct metric_traits<metric_id::l1_hit_rate>
{
static constexpr const char *metric_name = "l1tex__t_sector_hit_rate.pct";
static constexpr const char *summary = "L1 Cache Hit Rate (HW)";
static constexpr const char *hint = "percentage";
static constexpr const char *short_name = "L1HitRate";
static constexpr const char *description = "Hit rate at L1 cache.";
static constexpr double divider = 100.0;
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_l1_hit_rate_collected();
};
};
// Sector hit rate at the L2 cache.
template <>
struct metric_traits<metric_id::l2_hit_rate>
{
static constexpr const char *metric_name = "lts__t_sector_hit_rate.pct";
static constexpr const char *summary = "L2 Cache Hit Rate (HW)";
static constexpr const char *hint = "percentage";
static constexpr const char *short_name = "L2HitRate";
static constexpr const char *description = "Hit rate at L2 cache.";
static constexpr double divider = 100.0;
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_l2_hit_rate_collected();
};
};
// Recursively visit every metric_id at compile time and append the CUPTI
// metric name of each metric the user requested. Names are appended in
// metric_id order; gen_summary relies on this ordering when reading the
// evaluated results back.
template <metric_id id = metric_id::dram_peak_sustained_throughput>
void add_metrics_impl(nvbench::state &state, std::vector<std::string> &metrics)
{
if (metric_traits<id>::is_collected(state))
{
metrics.emplace_back(metric_traits<id>::metric_name);
}
constexpr auto next_id = static_cast<metric_id>(static_cast<int>(id) + 1);
add_metrics_impl<next_id>(state, metrics);
}
// Base case: the recursion terminates at metric_id::count.
template <>
void add_metrics_impl<metric_id::count>(nvbench::state &,
std::vector<std::string> &)
{}
// Build the list of CUPTI metric names requested on `state`.
std::vector<std::string> add_metrics(nvbench::state &state)
{
std::vector<std::string> metrics;
metrics.reserve(static_cast<int>(metric_id::count));
add_metrics_impl(state, metrics);
return metrics;
}
} // namespace
// Construct the CUPTI profiler for the state's device with the set of
// metrics the user requested. The function-try-block logs construction
// failures as a warning; note that for constructors the exception is
// implicitly rethrown when the handler finishes, so the failure still
// propagates to the caller after being logged.
measure_cupti_base::measure_cupti_base(state &exec_state)
try : m_state{exec_state}, m_cupti(*m_state.get_device(), add_metrics(m_state))
{}
catch (const std::exception &ex)
{
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
printer_opt_ref)
{
auto &printer = printer_opt_ref.value().get();
printer.log(nvbench::log_level::warn,
fmt::format("CUPTI failed to construct profiler: {}",
ex.what()));
}
}
// Validate the preconditions for a CUPTI measurement: the state must have a
// device, and that device must currently be active.
void measure_cupti_base::check()
{
  const auto dev = m_state.get_device();
  if (!dev)
  {
    NVBENCH_THROW(std::runtime_error,
                  "{}",
                  "Device required for `cupti` measurement.");
  }
  // An inactive device here means something went wrong higher up the stack.
  if (!dev->is_active())
  {
    NVBENCH_THROW(std::runtime_error,
                  "{}",
                  "Internal error: Current device is not active.");
  }
}
namespace
{
// Recursively walk all metric_ids at compile time and, for each metric the
// user requested, publish its value as a summary on the state. `result`
// holds only the collected metrics, densely packed in metric_id order (the
// same order add_metrics used), so `result_id` advances only when a metric
// was actually collected.
template <metric_id id = metric_id::dram_peak_sustained_throughput>
void gen_summary(std::size_t result_id,
nvbench::state &m_state,
const std::vector<double> &result)
{
using metric = metric_traits<id>;
if (metric::is_collected(m_state))
{
auto &summ = m_state.add_summary(metric::summary);
summ.set_string("hint", metric::hint);
summ.set_string("short_name", metric::short_name);
summ.set_string("description", metric::description);
// Rescale the raw value (every current metric divides by 100).
summ.set_float64("value", result[result_id++] / metric::divider);
}
constexpr auto next_id = static_cast<metric_id>(static_cast<int>(id) + 1);
gen_summary<next_id>(result_id, m_state, result);
}
// Base case: past the last metric, nothing to publish.
template <>
void gen_summary<metric_id::count>(std::size_t,
nvbench::state &,
const std::vector<double> &)
{}
// Entry point: publish a summary for every collected metric in `result`.
void gen_summaries(nvbench::state &state, const std::vector<double> &result)
{
gen_summary(0, state, result);
}
} // namespace
// Evaluate the collected counters and attach them to the state as
// summaries. This is a function-try-block on an ordinary (non-constructor)
// function, so — unlike the constructor above — the handler swallows the
// exception: CUPTI failures are demoted to a warning log instead of
// aborting the benchmark run.
void measure_cupti_base::generate_summaries()
try
{
gen_summaries(m_state, m_cupti.get_counter_values());
}
catch (const std::exception &ex)
{
if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
printer_opt_ref)
{
auto &printer = printer_opt_ref.value().get();
printer.log(nvbench::log_level::warn,
fmt::format("CUPTI failed to generate the summary: {}",
ex.what()));
}
}
} // namespace nvbench::detail

View File

@@ -0,0 +1,148 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/blocking_kernel.cuh>
#include <nvbench/config.cuh>
#include <nvbench/cpu_timer.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/cuda_timer.cuh>
#include <nvbench/cupti_profiler.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#include <nvbench/detail/l2flush.cuh>
#include <nvbench/detail/statistics.cuh>
#include <cuda_runtime.h>
#include <algorithm>
#include <utility>
#include <vector>
namespace nvbench
{
struct state;
namespace detail
{
// non-templated code goes here:
// Non-templated base for the CUPTI measurement. Owns the launch/stream,
// an L2-flush helper, and the CUPTI profiler session. Not copyable or
// movable.
struct measure_cupti_base
{
explicit measure_cupti_base(nvbench::state &exec_state);
measure_cupti_base(const measure_cupti_base &) = delete;
measure_cupti_base(measure_cupti_base &&) = delete;
measure_cupti_base &operator=(const measure_cupti_base &) = delete;
measure_cupti_base &operator=(measure_cupti_base &&) = delete;
protected:
struct kernel_launch_timer;
// Throws std::runtime_error if the state has no device or if that device
// is not currently active.
void check();
// Read back the evaluated counter values and publish them on m_state.
void generate_summaries();
// Flush the device's L2 cache on the measurement stream.
__forceinline__ void flush_device_l2()
{
m_l2flush.flush(m_launch.get_stream());
}
// Block until all work queued on the measurement stream has completed.
__forceinline__ void sync_stream() const
{
NVBENCH_CUDA_CALL(cudaStreamSynchronize(m_launch.get_stream()));
}
nvbench::state &m_state;
nvbench::launch m_launch;
nvbench::detail::l2flush m_l2flush;
cupti_profiler m_cupti;
};
// Hooks invoked by the kernel launcher around each launch during the CUPTI
// replay loop: start() flushes L2 and synchronizes before opening the CUPTI
// user range; stop() closes the range and synchronizes again.
struct measure_cupti_base::kernel_launch_timer
{
explicit kernel_launch_timer(measure_cupti_base &measure)
: m_measure{measure}
{}
// Invoked before the kernel launch.
__forceinline__ void start()
{
m_measure.flush_device_l2();
m_measure.sync_stream();
if (m_measure.m_cupti.is_initialized())
{
m_measure.m_cupti.start_user_loop();
}
}
// Invoked after the kernel launch.
__forceinline__ void stop()
{
if (m_measure.m_cupti.is_initialized())
{
m_measure.m_cupti.stop_user_loop();
}
m_measure.sync_stream();
}
private:
measure_cupti_base &m_measure;
};
// Measurement that runs the kernel under CUPTI replay: the kernel is
// launched repeatedly until the profiler reports all required passes have
// been submitted, then the counters are decoded into summaries.
template <typename KernelLauncher>
struct measure_cupti : public measure_cupti_base
{
measure_cupti(nvbench::state &state, KernelLauncher &kernel_launcher)
: measure_cupti_base(state)
, m_kernel_launcher{kernel_launcher}
{}
// Validate preconditions, run the replay loop, publish summaries.
void operator()()
{
this->check();
this->run();
this->generate_summaries();
}
private:
// Run the kernel as many times as CUPTI requires.
void run()
{
kernel_launch_timer timer(*this);
m_cupti.prepare_user_loop();
do
{
m_kernel_launcher(m_launch, timer);
} while (m_cupti.is_replay_required());
m_cupti.process_user_loop();
}
KernelLauncher &m_kernel_launcher;
};
} // namespace detail
} // namespace nvbench

View File

@@ -23,10 +23,14 @@
"Do not include it directly."
#endif // NVBENCH_STATE_EXEC_GUARD
#include <nvbench/config.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/state.cuh>
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#ifdef NVBENCH_HAS_CUPTI
#include <nvbench/detail/measure_cupti.cuh>
#endif // NVBENCH_HAS_CUPTI
#include <nvbench/detail/measure_cold.cuh>
#include <nvbench/detail/measure_hot.cuh>
@@ -83,6 +87,19 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
constexpr bool use_blocking_kernel = !(tags & no_block);
if constexpr (tags & timer)
{
// Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
if constexpr (!(modifier_tags & run_once))
{
if (this->is_cupti_required())
{
using measure_t = nvbench::detail::measure_cupti<KL>;
measure_t measure{*this, kernel_launcher};
measure();
}
}
#endif
using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
measure_t measure{*this, kernel_launcher};
measure();
@@ -90,9 +107,23 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
else
{ // Need to wrap the kernel launcher with a timer wrapper:
using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
wrapper_t wrapper{kernel_launcher};
// Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
if constexpr (!(modifier_tags & run_once))
{
if (this->is_cupti_required())
{
using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
measure_t measure{*this, wrapper};
measure();
}
}
#endif
using measure_t =
nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
wrapper_t wrapper{kernel_launcher};
measure_t measure(*this, wrapper);
measure();
}

View File

@@ -144,4 +144,20 @@ catch (nvml::call_failed &e)
}
#endif // NVBENCH_HAS_NVML
#ifdef NVBENCH_HAS_CUPTI
// Return the current CUDA driver context. The device must be the active one
// (see set_active); cuCtxGetCurrent reports the calling thread's current
// context, so calling this for an inactive device would be meaningless.
[[nodiscard]] CUcontext device_info::get_context() const
{
if (!is_active())
{
NVBENCH_THROW(std::runtime_error,
"{}",
"get_context is called for inactive device");
}
CUcontext cu_context;
NVBENCH_DRIVER_API_CALL(cuCtxGetCurrent(&cu_context));
return cu_context;
}
#endif
} // namespace nvbench

View File

@@ -18,12 +18,14 @@
#pragma once
#include <nvbench/config.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/detail/device_scope.cuh>
#include <cuda_runtime_api.h>
#include <cstdint> // CHAR_BIT
#include <stdexcept>
#include <string_view>
#include <utility>
@@ -67,6 +69,13 @@ struct device_info
/// Make this device the active CUDA device for the calling thread.
void set_active() const
{
NVBENCH_CUDA_CALL(cudaSetDevice(m_id));
#ifdef NVBENCH_HAS_CUPTI
// cudaSetDevice doesn't initialize a context on the first call, so we have
// to force it. According to the documentation, if devPtr is 0, no
// operation is performed.
NVBENCH_CUDA_CALL(cudaFree(nullptr));
#endif
}
/// Enable or disable persistence mode.
@@ -200,6 +209,20 @@ struct device_info
/// @return True if ECC is enabled on this device.
[[nodiscard]] bool get_ecc_state() const { return m_prop.ECCEnabled; }
/// @return True if CUPTI supports this device.
/// CUPTI profiling is only reported as available for devices with SM major
/// version >= 7, and only when NVBench was built with CUPTI support.
[[nodiscard]] bool is_cupti_supported() const
{
#ifdef NVBENCH_HAS_CUPTI
return m_prop.major >= 7;
#else
return false;
#endif
}
#ifdef NVBENCH_HAS_CUPTI
[[nodiscard]] CUcontext get_context() const;
#endif
/// @return A cached copy of the device's cudaDeviceProp.
[[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const
{

View File

@@ -20,6 +20,7 @@
#include <nvbench/benchmark_base.cuh>
#include <nvbench/benchmark_manager.cuh>
#include <nvbench/config.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/option_parser.cuh>
#include <nvbench/printer_base.cuh>
@@ -45,9 +46,16 @@
return 1; \
}
#ifdef NVBENCH_HAS_CUPTI
#define NVBENCH_INITIALIZE_DRIVER_API NVBENCH_DRIVER_API_CALL(cuInit(0));
#else
#define NVBENCH_INITIALIZE_DRIVER_API
#endif
#define NVBENCH_MAIN_BODY(argc, argv) \
do \
{ \
NVBENCH_INITIALIZE_DRIVER_API \
nvbench::option_parser parser; \
parser.parse(argc, argv); \
auto &printer = parser.get_printer(); \

View File

@@ -221,6 +221,51 @@ struct state
return m_benchmark;
}
/// Request collection of the L1 cache hit-rate CUPTI metric.
void collect_l1_hit_rates() { m_collect_l1_hit_rates = true; }
/// Request collection of the L2 cache hit-rate CUPTI metric.
void collect_l2_hit_rates() { m_collect_l2_hit_rates = true; }
/// Request collection of the global-store efficiency CUPTI metric.
void collect_stores_efficiency() { m_collect_stores_efficiency = true; }
/// Request collection of the global-load efficiency CUPTI metric.
void collect_loads_efficiency() { m_collect_loads_efficiency = true; }
/// Request collection of the DRAM throughput CUPTI metric.
void collect_dram_throughput() { m_collect_dram_throughput = true; }
/// Convenience helper: request every CUPTI metric listed above.
void collect_cupti_metrics()
{
collect_l1_hit_rates();
collect_l2_hit_rates();
collect_stores_efficiency();
collect_loads_efficiency();
collect_dram_throughput();
}
/// @return True if the L1 hit-rate metric was requested.
[[nodiscard]] bool is_l1_hit_rate_collected() const
{
return m_collect_l1_hit_rates;
}
/// @return True if the L2 hit-rate metric was requested.
[[nodiscard]] bool is_l2_hit_rate_collected() const
{
return m_collect_l2_hit_rates;
}
/// @return True if the store-efficiency metric was requested.
[[nodiscard]] bool is_stores_efficiency_collected() const
{
return m_collect_stores_efficiency;
}
/// @return True if the load-efficiency metric was requested.
[[nodiscard]] bool is_loads_efficiency_collected() const
{
return m_collect_loads_efficiency;
}
/// @return True if the DRAM throughput metric was requested.
[[nodiscard]] bool is_dram_throughput_collected() const
{
return m_collect_dram_throughput;
}
/// @return True if any CUPTI metric was requested, i.e. a CUPTI
/// measurement needs to run for this state.
[[nodiscard]] bool is_cupti_required() const
{
return is_l2_hit_rate_collected()
|| is_l1_hit_rate_collected()
|| is_stores_efficiency_collected()
|| is_loads_efficiency_collected()
|| is_dram_throughput_collected();
}
summary &add_summary(std::string summary_name);
summary &add_summary(summary s);
[[nodiscard]] const summary &get_summary(std::string_view name) const;
@@ -279,6 +324,12 @@ private:
std::string m_skip_reason;
std::size_t m_element_count{};
std::size_t m_global_memory_rw_bytes{};
bool m_collect_l1_hit_rates{};
bool m_collect_l2_hit_rates{};
bool m_collect_stores_efficiency{};
bool m_collect_loads_efficiency{};
bool m_collect_dram_throughput{};
};
} // namespace nvbench

View File

@@ -7,4 +7,12 @@ find_package(NVBench)
add_executable(test_bench test_bench.cu)
target_link_libraries(test_bench PRIVATE nvbench::main)

enable_testing()
# Register the binary once, as `test_bench`. The stale duplicate
# registration under the name `test` is removed: it pointed at the same
# binary but never received the CUPTI ENVIRONMENT property set below, so it
# would fail at runtime on UNIX when CUPTI is enabled.
add_test(NAME test_bench COMMAND "$<TARGET_FILE:test_bench>" --timeout 1)

# Need to add the CUPTI path to LD_LIBRARY_PATH to make sure CUPTI libraries
# are found at runtime:
if (UNIX AND TARGET nvbench::cupti)
  get_property(cupti_lib_path TARGET nvbench::cupti PROPERTY IMPORTED_LOCATION)
  cmake_path(GET cupti_lib_path PARENT_PATH cupti_lib_path)
  set_property(TEST test_bench PROPERTY ENVIRONMENT "LD_LIBRARY_PATH=${cupti_lib_path}")
endif()