CUPTI support

This commit is contained in:
Georgy Evtushenko
2021-09-01 19:43:12 +03:00
parent 3d6c16f8ba
commit 1bc715267c
26 changed files with 1705 additions and 4 deletions

2
.gitignore vendored
View File

@@ -1 +1,3 @@
build/
.idea
cmake-build-*

View File

@@ -17,6 +17,7 @@ project(NVBench
nvbench_init_rapids_cmake()
option(NVBench_ENABLE_NVML "Build with NVML support from the Cuda Toolkit." ON)
option(NVBench_ENABLE_CUPTI "Build NVBench with CUPTI." ON)
option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)

71
cmake/NVBenchCUPTI.cmake Normal file
View File

@@ -0,0 +1,71 @@
# Since this file is installed, we need to make sure that the CUDAToolkit has
# been found by consumers:
if (NOT TARGET CUDA::toolkit)
find_package(CUDAToolkit REQUIRED)
endif()
# CUPTI ships under <toolkit>/extras/CUPTI; cache its root for the lookups below.
set(nvbench_cupti_root "${CUDAToolkit_LIBRARY_ROOT}/extras/CUPTI")
# The CUPTI targets in FindCUDAToolkit are broken:
# - The dll locations are not specified
# - Dependent libraries nvperf_* are not linked.
# So we create our own targets:
# Creates an imported target `nvbench::<dep_name_lower>` for a CUPTI shared
# library shipped with the CUDA Toolkit.
#
# Arguments:
#   dep_name - Library base name (e.g. `cupti`, `nvperf_host`); case-insensitive.
#
# Cache variables created (marked advanced):
#   NVBench_<DEP>_DLL     - Windows only: full path to the dll.
#   NVBench_<DEP>_LIBRARY - Full path to the import .lib (Windows) or .so (other).
#
# Relies on `nvbench_cupti_root` being set by this file before the call.
function(nvbench_add_cupti_dep dep_name)
  string(TOLOWER ${dep_name} dep_name_lower)
  string(TOUPPER ${dep_name} dep_name_upper)

  add_library(nvbench::${dep_name_lower} SHARED IMPORTED)

  if (WIN32)
    # Attempt to locate the dll in the expected location. This is necessary
    # because the CUPTI dll has a versioned suffix, so we can't directly search
    # for it with find_file.
    file(GLOB dep_dll_paths "${nvbench_cupti_root}/lib64/${dep_name_lower}*dll")
    if (dep_dll_paths)
      # GLOB may match several versioned dlls; `cmake_path(GET ...)` on a
      # ;-list would produce a bogus filename, so pick one deterministically.
      list(SORT dep_dll_paths)
      list(GET dep_dll_paths 0 dep_dll_path)
      cmake_path(GET dep_dll_path FILENAME dep_dll_filename)
    else()
      # If the dll was not found in the expected location, use a default
      # filename as a user hint.
      set(dep_dll_filename ${dep_name_lower}.dll)
    endif()

    # Use find_file to create a cache variable and mark the file as REQUIRED.
    find_file(NVBench_${dep_name_upper}_DLL ${dep_dll_filename} REQUIRED
      DOC "The full path to ${dep_name_lower}.dll from the CUDA Toolkit."
      HINTS "${nvbench_cupti_root}/lib64/"
    )
    mark_as_advanced(NVBench_${dep_name_upper}_DLL)

    # The .libs don't have suffixes, so we can just directly search for them.
    find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower}.lib REQUIRED
      DOC "The full path to ${dep_name_lower}.lib from the CUDA Toolkit."
      HINTS "${nvbench_cupti_root}/lib64/"
    )
    mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY)

    set_target_properties(nvbench::${dep_name_lower} PROPERTIES
      IMPORTED_LOCATION "${NVBench_${dep_name_upper}_DLL}"
      IMPORTED_IMPLIB "${NVBench_${dep_name_upper}_LIBRARY}"
    )
  else()
    find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower} REQUIRED
      DOC "The full path to lib${dep_name_lower}.so from the CUDA Toolkit."
      HINTS "${nvbench_cupti_root}/lib64"
    )
    mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY)
    set_target_properties(nvbench::${dep_name_lower} PROPERTIES
      IMPORTED_LOCATION "${NVBench_${dep_name_upper}_LIBRARY}"
    )
  endif()
endfunction()
# Create imported targets for CUPTI and its required nvperf_* companions:
nvbench_add_cupti_dep(nvperf_target)
nvbench_add_cupti_dep(nvperf_host)
nvbench_add_cupti_dep(cupti)
# FindCUDAToolkit does not link the nvperf libraries; attach them as usage
# requirements so consumers of nvbench::cupti get them transitively.
target_link_libraries(nvbench::cupti INTERFACE
nvbench::nvperf_target
nvbench::nvperf_host
)
# CUPTI headers live outside the toolkit's normal include directory.
target_include_directories(nvbench::cupti INTERFACE
"${nvbench_cupti_root}/include"
)

View File

@@ -64,3 +64,10 @@ if (NVBench_ENABLE_NVML)
include("${CMAKE_CURRENT_LIST_DIR}/NVBenchNVML.cmake")
list(APPEND ctk_libraries nvbench::nvml)
endif()
################################################################################
# CUDAToolkit -> CUPTI
if (NVBench_ENABLE_CUPTI)
include("${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake")
list(APPEND ctk_libraries CUDA::cuda_driver nvbench::cupti)
endif()

View File

@@ -23,7 +23,9 @@ endif()
function(nvbench_setup_dep_dlls target_name)
# The custom command below fails when there aren't any runtime DLLs to copy,
# so only enable it when a relevant dependency is enabled:
if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD AND NVBench_ENABLE_NVML)
if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD AND
(NVBench_ENABLE_NVML OR
NVBench_ENABLE_CUPTI))
add_custom_command(TARGET ${target_name}
POST_BUILD
COMMAND

View File

@@ -11,6 +11,15 @@ macro(nvbench_generate_exports)
)
endif()
if (NVBench_ENABLE_CUPTI)
string(APPEND nvbench_build_export_code_block
"include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake\")\n"
)
string(APPEND nvbench_install_export_code_block
"include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake\")\n"
)
endif()
rapids_export(BUILD NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"

View File

@@ -39,6 +39,14 @@ if (NVBench_ENABLE_NVML)
)
endif()
if (NVBench_ENABLE_CUPTI)
install(
FILES
"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake"
DESTINATION "${config_install_location}"
)
endif()
# Call with a list of library targets to generate install rules:
function(nvbench_install_libraries)
install(TARGETS ${ARGN}

View File

@@ -3,5 +3,9 @@ function(nvbench_write_config_header filepath)
set(NVBENCH_HAS_NVML 1)
endif()
if (NVBench_ENABLE_CUPTI)
set(NVBENCH_HAS_CUPTI 1)
endif()
configure_file("${NVBench_SOURCE_DIR}/cmake/config.cuh.in" "${filepath}")
endfunction()

View File

@@ -20,3 +20,6 @@
// Defined if NVBench has been built with NVML support.
#cmakedefine NVBENCH_HAS_NVML
// Defined if NVBench has been built with CUPTI support.
#cmakedefine NVBENCH_HAS_CUPTI

View File

@@ -5,6 +5,7 @@ set(example_srcs
exec_tag_timer.cu
skip.cu
throughput.cu
auto_throughput.cu
)
# Metatarget for all examples:

View File

@@ -0,0 +1,87 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// Strided copy kernel used by the throughput benchmark: the grid walks the
// input with a thread-stride pattern controlled by `stride`, copying
// ItemsPerThread values per step.
// Reads wrap via `% elements`, so they stay inside `in` (size `elements`);
// writes go to `tid + j * elements`, inside `out`
// (size elements * ItemsPerThread).
template <int ItemsPerThread>
__global__ void kernel(std::size_t stride,
std::size_t elements,
const nvbench::int32_t * __restrict__ in,
nvbench::int32_t *__restrict__ out)
{
// Global thread index and total number of threads in the grid.
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const std::size_t step = gridDim.x * blockDim.x;
for (std::size_t i = stride * tid;
i < stride * elements;
i += stride * step)
{
for (int j = 0; j < ItemsPerThread; j++)
{
const auto read_id = (ItemsPerThread * i + j) % elements;
const auto write_id = tid + j * elements;
out[write_id] = in[read_id];
}
}
}
// `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
// and cache hit rates.
//
// Calling state.collect_*() enables particular metric collection if nvbench
// was built with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
// Benchmark entry point: copies a 128 MiB int32 buffer with the strided
// kernel above and asks NVBench to collect CUPTI metrics for each launch.
// `state` carries the axis values ("Stride") and receives the measurements.
template <int ItemsPerThread>
void throughput_bench(nvbench::state &state,
nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
{
// Allocate input data:
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
thrust::device_vector<nvbench::int32_t> input(elements);
// Output is ItemsPerThread times larger: each thread writes one value per j.
thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
// Provide throughput information:
state.add_element_count(elements, "Elements");
// These collect_* calls are no-ops unless NVBench was built with CUPTI.
state.collect_dram_throughput();
state.collect_l1_hit_rates();
state.collect_l2_hit_rates();
state.collect_loads_efficiency();
state.collect_stores_efficiency();
const auto threads_in_block = 256;
// Round up so every element gets a thread.
const auto blocks_in_grid = (elements + threads_in_block - 1) /
threads_in_block;
state.exec([&](nvbench::launch &launch) {
kernel<ItemsPerThread>
<<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
stride,
elements,
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()));
});
}
// Instantiate the benchmark for 1 and 2 items per thread, with strides 1 and 4.
using items_per_thread = nvbench::enum_type_list<1, 2>;
NVBENCH_BENCH_TYPES(throughput_bench, NVBENCH_TYPE_AXES(items_per_thread))
.add_int64_axis("Stride", nvbench::range(1, 4, 3));

View File

@@ -26,6 +26,10 @@ set(srcs
detail/state_generator.cxx
)
if (NVBench_ENABLE_CUPTI)
list(APPEND srcs detail/measure_cupti.cu cupti_profiler.cxx)
endif()
if (NVBench_ENABLE_NVML)
list(APPEND srcs internal/nvml.cxx)
endif()

View File

@@ -17,6 +17,7 @@
*/
#include <nvbench/cuda_call.cuh>
#include <nvbench/config.cuh>
#include <fmt/format.h>
@@ -44,6 +45,34 @@ void throw_error(const std::string &filename,
command));
}
#ifdef NVBENCH_HAS_CUPTI
// Formats a CUDA driver API error (symbolic name + description) and throws it
// as std::runtime_error, tagged with the file/line and the failing call text.
void throw_error(const std::string &filename,
std::size_t lineno,
const std::string &command,
CUresult error_code)
{
// NOTE(review): cuGetErrorName/cuGetErrorString leave these nullptr for an
// unrecognized code; fmt::format on a null const char* is undefined — confirm
// error codes here always come straight from a driver call.
const char *name = nullptr;
cuGetErrorName(error_code, &name);
const char *string = nullptr;
cuGetErrorString(error_code, &string);
throw std::runtime_error(fmt::format("{}:{}: Driver API call returned error: "
"{}: {}\nCommand: '{}'",
filename,
lineno,
name,
string,
command));
}
#else
// Without CUPTI support this overload is a stub so callers still link.
// NOTE(review): it silently discards the error — confirm that is intended.
void throw_error(const std::string &,
std::size_t,
const std::string &,
CUresult)
{}
#endif
void exit_error(const std::string &filename,
std::size_t lineno,
const std::string &command,

View File

@@ -19,6 +19,7 @@
#pragma once
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <string>
@@ -36,6 +37,20 @@
} \
} while (false)
/// Throws a std::runtime_error if `call` doesn't return `CUDA_SUCCESS`.
#define NVBENCH_DRIVER_API_CALL(call) \
do \
{ \
const CUresult nvbench_cuda_call_error = call; \
if (nvbench_cuda_call_error != CUDA_SUCCESS) \
{ \
nvbench::cuda_call::throw_error(__FILE__, \
__LINE__, \
#call, \
nvbench_cuda_call_error); \
} \
} while (false)
/// Terminates process with failure status if `call` doesn't return
/// `cudaSuccess`.
#define NVBENCH_CUDA_CALL_NOEXCEPT(call) \
@@ -59,6 +74,11 @@ void throw_error(const std::string &filename,
const std::string &call,
cudaError_t error);
void throw_error(const std::string &filename,
std::size_t lineno,
const std::string &call,
CUresult error);
void exit_error(const std::string &filename,
std::size_t lineno,
const std::string &command,

129
nvbench/cupti_profiler.cuh Normal file
View File

@@ -0,0 +1,129 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/config.cuh>
#include <nvbench/device_info.cuh>
#include <string>
#include <vector>
#include <optional>
namespace nvbench::detail
{
#ifdef NVBENCH_HAS_CUPTI
/**
* Pass required metrics in the constructor and organize your code as follows
* to get counters back:
*
* ```
* cupti_profiler cupti(
*   nvbench::device_info{0},
*   { "l1tex__t_sector_hit_rate.pct" });
*
* cupti.prepare_user_loop();
*
* do
* {
*   cupti.start_user_loop();
*
*   kernel_1<<<1, 1>>>();
*   // ...
*   kernel_n<<<1, 1>>>();
*
*   cupti.stop_user_loop();
* } while (cupti.is_replay_required());
*
* cupti.process_user_loop();
*
* auto result = cupti.get_counter_values();
* ```
*
* Check Perfworks Metric table here for the full list of metrics:
* https://docs.nvidia.com/cupti/r_main.html#metrics-reference-7x
*/
class cupti_profiler
{
// Set to true only after the constructor completes every init stage; the
// destructor checks it before deinitializing CUPTI.
bool m_available {};
// GPU chip name reported by CUPTI; keys all NVPW evaluator/config creation.
std::string m_chip_name;
// Counter data
std::vector<std::string> m_metric_names;
// Serialized NVPW/CUPTI images produced during initialization:
std::vector<std::uint8_t> m_data_image_prefix;
std::vector<std::uint8_t> m_config_image;
std::vector<std::uint8_t> m_data_image;
std::vector<std::uint8_t> m_data_scratch_buffer;
std::vector<std::uint8_t> m_availability_image;
nvbench::device_info m_device;
// CUPTI runs a series of replay passes, where each pass contains a sequence
// of ranges. Every metric enabled in the configuration is collected
// separately per unique range in the pass. CUPTI supports auto and
// user-defined ranges. With auto range mode, ranges are defined around each
// kernel automatically. In the user range mode, ranges are defined manually.
// We define a single user range for the whole measurement.
static const int m_num_ranges = 1;
public:
// Move only
cupti_profiler(cupti_profiler &&) noexcept;
cupti_profiler &operator=(cupti_profiler &&) noexcept;
cupti_profiler(const cupti_profiler &) = delete;
cupti_profiler &operator=(const cupti_profiler &) = delete;
cupti_profiler(nvbench::device_info device,
std::vector<std::string> &&metric_names);
~cupti_profiler();
[[nodiscard]] bool is_initialized() const;
/// Should be called before replay loop
void prepare_user_loop();
/// Should be called before any kernel calls in the replay loop
void start_user_loop();
/// Should be called after all kernel calls in the replay loop
void stop_user_loop();
/// Should be called after the replay loop
void process_user_loop();
/// Indicates whether another iteration of the replay loop is required
[[nodiscard]] bool is_replay_required();
/// Returns counters for metrics requested in the constructor
[[nodiscard]] std::vector<double> get_counter_values();
private:
// Initialization pipeline, invoked in this order by the constructor:
void initialize_profiler();
void initialize_chip_name();
void initialize_availability_image();
static void initialize_nvpw();
void initialize_config_image();
void initialize_counter_data_prefix_image();
void initialize_counter_data_image();
};
#endif
} // namespace nvbench::detail

775
nvbench/cupti_profiler.cxx Normal file
View File

@@ -0,0 +1,775 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cupti_profiler_target.h>
#include <cupti_target.h>
#include <nvbench/cupti_profiler.cuh>
#include <nvbench/device_info.cuh>
#include <nvperf_cuda_host.h>
#include <nvperf_host.h>
#include <nvperf_target.h>
#include <fmt/format.h>
#include <stdexcept>
namespace nvbench::detail
{
namespace
{
void cupti_call(const CUptiResult status)
{
if (status != CUPTI_SUCCESS)
{
const char *errstr{};
cuptiGetResultString(status, &errstr);
throw std::runtime_error(
fmt::format("CUPTI call returned error: {}\n", errstr));
}
}
// Checks an NVPW (Perfworks) return code; throws std::runtime_error with the
// numeric status on failure (NVPW offers no string lookup here).
void nvpw_call(const NVPA_Status status)
{
  if (status == NVPA_STATUS_SUCCESS)
  {
    return;
  }
  throw std::runtime_error(
    fmt::format("NVPW call returned error: {}\n", status));
}
} // namespace
// Constructs a profiler bound to `device` that will collect `metric_names`.
// Runs the full CUPTI/NVPW initialization pipeline; throws std::runtime_error
// on any failure (unsupported device, CUPTI or NVPW error).
cupti_profiler::cupti_profiler(nvbench::device_info device,
                               std::vector<std::string> &&metric_names)
    // The parameter is an rvalue reference: take ownership instead of copying.
    : m_metric_names(std::move(metric_names))
    , m_device(device)
{
  initialize_profiler();
  initialize_chip_name();
  initialize_availability_image();
  initialize_nvpw();
  initialize_config_image();
  initialize_counter_data_prefix_image();
  initialize_counter_data_image();
  // Flag success only after every stage completed; the destructor keys its
  // CUPTI deinitialization off this flag.
  m_available = true;
}
// Move constructor: builds m_device from rhs, then delegates all member
// transfer to the move-assignment operator below.
cupti_profiler::cupti_profiler(cupti_profiler &&rhs) noexcept
: m_device(rhs.m_device.get_id(), rhs.m_device.get_cuda_device_prop())
{
(*this) = std::move(rhs);
}
// Move assignment: transfers every image/buffer and clears rhs's availability
// flag so only one object ever deinitializes CUPTI in its destructor.
cupti_profiler &cupti_profiler::operator=(cupti_profiler &&rhs) noexcept
{
m_device = rhs.m_device;
m_available = rhs.m_available;
m_chip_name = std::move(rhs.m_chip_name);
m_metric_names = std::move(rhs.m_metric_names);
m_data_image_prefix = std::move(rhs.m_data_image_prefix);
m_config_image = std::move(rhs.m_config_image);
m_data_image = std::move(rhs.m_data_image);
m_data_scratch_buffer = std::move(rhs.m_data_scratch_buffer);
m_availability_image = std::move(rhs.m_availability_image);
rhs.m_available = false;
return *this;
}
// Verifies the device supports the CUPTI profiling API and initializes the
// CUPTI profiler; throws std::runtime_error for unsupported devices.
void cupti_profiler::initialize_profiler()
{
if (!m_device.is_cupti_supported())
{
throw std::runtime_error(fmt::format("Device: {} isn't supported (CC {})",
m_device.get_id(),
m_device.get_sm_version()));
}
CUpti_Profiler_Initialize_Params params = {
CUpti_Profiler_Initialize_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerInitialize(&params));
}
// Queries CUPTI for the device's chip name; NVPW evaluator/config creation is
// keyed on this string.
void cupti_profiler::initialize_chip_name()
{
CUpti_Device_GetChipName_Params params = {
CUpti_Device_GetChipName_Params_STRUCT_SIZE};
params.deviceIndex = m_device.get_id();
cupti_call(cuptiDeviceGetChipName(&params));
m_chip_name = std::string(params.pChipName);
}
// Fetches the counter-availability image for the device's context using the
// usual CUPTI two-call pattern: first call reports the size, second fills it.
void cupti_profiler::initialize_availability_image()
{
CUpti_Profiler_GetCounterAvailability_Params params = {
CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE};
params.ctx = m_device.get_context();
cupti_call(cuptiProfilerGetCounterAvailability(&params));
m_availability_image.clear();
m_availability_image.resize(params.counterAvailabilityImageSize);
params.pCounterAvailabilityImage = m_availability_image.data();
cupti_call(cuptiProfilerGetCounterAvailability(&params));
}
// One-time host-side initialization of the NVPW (Perfworks) library.
void cupti_profiler::initialize_nvpw()
{
NVPW_InitializeHost_Params params = {NVPW_InitializeHost_Params_STRUCT_SIZE};
nvpw_call(NVPW_InitializeHost(&params));
}
namespace
{
// Thin wrapper around NVPW_MetricEvalRequest: translates a human-readable
// metric name into an eval request on construction and can enumerate the raw
// counters that metric depends on. Holds a non-owning pointer to the
// evaluator that created it.
class eval_request
{
NVPW_MetricsEvaluator *evaluator_ptr;
public:
eval_request(NVPW_MetricsEvaluator *evaluator_ptr,
const std::string &metric_name)
: evaluator_ptr(evaluator_ptr)
{
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params = {
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE};
params.pMetricsEvaluator = evaluator_ptr;
params.pMetricName = metric_name.c_str();
params.pMetricEvalRequest = &request;
params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
nvpw_call(
NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(&params));
}
// Returns the raw counter names this metric depends on (two-call pattern:
// size query, then fill).
// NOTE(review): the returned char pointers are owned by the evaluator, not
// by this object — they dangle once the evaluator is destroyed; confirm
// callers consume them before that.
[[nodiscard]] std::vector<const char *> get_raw_dependencies()
{
std::vector<const char *> raw_dependencies;
NVPW_MetricsEvaluator_GetMetricRawDependencies_Params params = {
NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE};
params.pMetricsEvaluator = evaluator_ptr;
params.pMetricEvalRequests = &request;
params.numMetricEvalRequests = 1;
params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
nvpw_call(NVPW_MetricsEvaluator_GetMetricRawDependencies(&params));
raw_dependencies.resize(params.numRawDependencies);
params.ppRawDependencies = raw_dependencies.data();
nvpw_call(NVPW_MetricsEvaluator_GetMetricRawDependencies(&params));
return raw_dependencies;
}
NVPW_MetricEvalRequest request;
};
// RAII owner of an NVPW_MetricsEvaluator plus its scratch buffer. The
// evaluator is created for a chip (and optionally a counter-availability /
// counter-data image) and destroyed in the destructor.
class metric_evaluator
{
bool initialized{};
NVPW_MetricsEvaluator *evaluator_ptr;
// Backing storage the evaluator requires for its lifetime.
std::vector<std::uint8_t> scratch_buffer;
public:
metric_evaluator(const std::string &chip_name,
const std::uint8_t *counter_availability_image = nullptr,
const std::uint8_t *counter_data_image = nullptr,
const std::size_t counter_data_image_size = 0)
{
// First ask NVPW how much scratch space the evaluator needs...
NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params
scratch_buffer_param = {
NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE};
scratch_buffer_param.pChipName = chip_name.c_str();
scratch_buffer_param.pCounterAvailabilityImage = counter_availability_image;
nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(
&scratch_buffer_param));
scratch_buffer.resize(scratch_buffer_param.scratchBufferSize);
// ...then create the evaluator inside that buffer.
NVPW_CUDA_MetricsEvaluator_Initialize_Params evaluator_params = {
NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE};
evaluator_params.scratchBufferSize = scratch_buffer.size();
evaluator_params.pScratchBuffer = scratch_buffer.data();
evaluator_params.pChipName = chip_name.c_str();
evaluator_params.pCounterAvailabilityImage = counter_availability_image;
evaluator_params.pCounterDataImage = counter_data_image;
evaluator_params.counterDataImageSize = counter_data_image_size;
nvpw_call(NVPW_CUDA_MetricsEvaluator_Initialize(&evaluator_params));
evaluator_ptr = evaluator_params.pMetricsEvaluator;
initialized = true;
}
~metric_evaluator()
{
if (initialized)
{
NVPW_MetricsEvaluator_Destroy_Params params = {
NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE};
params.pMetricsEvaluator = evaluator_ptr;
nvpw_call(NVPW_MetricsEvaluator_Destroy(&params));
}
}
// Builds an eval_request for `metric_name` bound to this evaluator.
[[nodiscard]] eval_request create_request(const std::string &metric_name)
{
return {evaluator_ptr, metric_name};
}
// Implicit conversion so the wrapper can be passed straight to NVPW APIs.
[[nodiscard]] operator NVPW_MetricsEvaluator *() const
{
return evaluator_ptr;
}
};
} // namespace
namespace
{
// Expands high-level metric names into the raw counter requests NVPW needs
// for config / counter-data-prefix generation.
// NOTE(review): the pMetricName pointers stored in the returned requests come
// from get_raw_dependencies() and appear to be owned by the local `evaluator`,
// which is destroyed when this function returns — confirm NVPW copies the
// names in the Add*Metrics calls, otherwise these dangle.
[[nodiscard]] std::vector<NVPA_RawMetricRequest> get_raw_metric_requests(
const std::string &chip_name,
const std::vector<std::string> &metric_names,
const std::uint8_t *counter_availability_image = nullptr)
{
metric_evaluator evaluator(chip_name, counter_availability_image);
// Collect the raw dependencies of every requested metric.
std::vector<const char *> raw_metric_names;
raw_metric_names.reserve(metric_names.size());
for (auto &metric_name : metric_names)
{
for (auto &raw_dependency :
evaluator.create_request(metric_name).get_raw_dependencies())
{
raw_metric_names.push_back(raw_dependency);
}
}
// Wrap each raw counter name in an NVPA request struct.
std::vector<NVPA_RawMetricRequest> raw_requests;
raw_requests.reserve(raw_metric_names.size());
for (auto &raw_name : raw_metric_names)
{
NVPA_RawMetricRequest metricRequest = {NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE};
metricRequest.pMetricName = raw_name;
metricRequest.isolated = true;
metricRequest.keepInstances = true;
raw_requests.push_back(metricRequest);
}
return raw_requests;
}
// RAII owner of an NVPA_RawMetricsConfig. The constructor runs the full NVPW
// config-image pipeline (create, set availability, one pass group with the
// requested metrics, generate); get_config_image() then serializes the result.
class metrics_config
{
bool initialized{};
// Creates the raw metrics config object for the chip.
void create(const std::string &chip_name,
const std::uint8_t *availability_image)
{
NVPW_CUDA_RawMetricsConfig_Create_V2_Params params = {
NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE};
params.activityKind = NVPA_ACTIVITY_KIND_PROFILER;
params.pChipName = chip_name.c_str();
params.pCounterAvailabilityImage = availability_image;
nvpw_call(NVPW_CUDA_RawMetricsConfig_Create_V2(&params));
raw_metrics_config = params.pRawMetricsConfig;
initialized = true;
}
// Restricts the config to counters available on this device/context.
void set_availability_image(const std::uint8_t *availability_image)
{
NVPW_RawMetricsConfig_SetCounterAvailability_Params params = {
NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
params.pCounterAvailabilityImage = availability_image;
nvpw_call(NVPW_RawMetricsConfig_SetCounterAvailability(&params));
}
// Opens the (single) pass group metrics are added into.
void begin_config_group()
{
NVPW_RawMetricsConfig_BeginPassGroup_Params params = {
NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
nvpw_call(NVPW_RawMetricsConfig_BeginPassGroup(&params));
}
// Registers the raw counter requests with the config.
void add_metrics(const std::vector<NVPA_RawMetricRequest> &raw_metric_requests)
{
NVPW_RawMetricsConfig_AddMetrics_Params params = {
NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
params.pRawMetricRequests = raw_metric_requests.data();
params.numMetricRequests = raw_metric_requests.size();
nvpw_call(NVPW_RawMetricsConfig_AddMetrics(&params));
}
// Closes the pass group opened by begin_config_group().
void end_config_group()
{
NVPW_RawMetricsConfig_EndPassGroup_Params params = {
NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
nvpw_call(NVPW_RawMetricsConfig_EndPassGroup(&params));
}
// Finalizes the config so its image can be extracted.
void generate()
{
NVPW_RawMetricsConfig_GenerateConfigImage_Params params = {
NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
nvpw_call(NVPW_RawMetricsConfig_GenerateConfigImage(&params));
}
public:
metrics_config(const std::string &chip_name,
const std::vector<NVPA_RawMetricRequest> &raw_metric_requests,
const std::uint8_t *availability_image)
{
create(chip_name, availability_image);
set_availability_image(availability_image);
begin_config_group();
add_metrics(raw_metric_requests);
end_config_group();
generate();
}
// Serializes the generated config (two-call pattern: size, then bytes).
[[nodiscard]] std::vector<std::uint8_t> get_config_image()
{
NVPW_RawMetricsConfig_GetConfigImage_Params params = {
NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
params.bytesAllocated = 0;
params.pBuffer = nullptr;
nvpw_call(NVPW_RawMetricsConfig_GetConfigImage(&params));
std::vector<std::uint8_t> config_image(params.bytesCopied);
params.bytesAllocated = config_image.size();
params.pBuffer = config_image.data();
nvpw_call(NVPW_RawMetricsConfig_GetConfigImage(&params));
return config_image;
}
~metrics_config()
{
if (initialized)
{
NVPW_RawMetricsConfig_Destroy_Params params = {
NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE};
params.pRawMetricsConfig = raw_metrics_config;
// Destructor must not throw, so the result is deliberately unchecked.
NVPW_RawMetricsConfig_Destroy(&params);
}
}
NVPA_RawMetricsConfig *raw_metrics_config;
};
} // namespace
// Builds the serialized metrics-config image for the requested metrics,
// constrained by this device's counter-availability image.
void cupti_profiler::initialize_config_image()
{
m_config_image =
metrics_config(m_chip_name,
get_raw_metric_requests(m_chip_name,
m_metric_names,
m_availability_image.data()),
m_availability_image.data())
.get_config_image();
}
namespace
{
// RAII owner of an NVPA_CounterDataBuilder, used to generate the
// counter-data prefix image.
class counter_data_builder
{
bool initialized{};
public:
counter_data_builder(const std::string &chip_name,
const std::uint8_t *pCounterAvailabilityImage)
{
NVPW_CUDA_CounterDataBuilder_Create_Params params = {
NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE};
params.pChipName = chip_name.c_str();
params.pCounterAvailabilityImage = pCounterAvailabilityImage;
nvpw_call(NVPW_CUDA_CounterDataBuilder_Create(&params));
builder = params.pCounterDataBuilder;
initialized = true;
}
~counter_data_builder()
{
if (initialized)
{
NVPW_CounterDataBuilder_Destroy_Params params = {
NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE};
params.pCounterDataBuilder = builder;
// Destructor must not throw, so the result is deliberately unchecked.
NVPW_CounterDataBuilder_Destroy(&params);
}
}
NVPA_CounterDataBuilder *builder;
};
} // namespace
// Generates the counter-data prefix image: registers all raw metric requests
// with a builder, then serializes the prefix with the two-call size/fill
// pattern.
// NOTE(review): here the availability image is deliberately nullptr, unlike
// initialize_config_image() — confirm this asymmetry is intended.
void cupti_profiler::initialize_counter_data_prefix_image()
{
const std::uint8_t *counter_availability_image = nullptr;
std::vector<NVPA_RawMetricRequest> raw_metric_requests =
get_raw_metric_requests(m_chip_name,
m_metric_names,
counter_availability_image);
counter_data_builder data_builder(m_chip_name, counter_availability_image);
{
NVPW_CounterDataBuilder_AddMetrics_Params params = {
NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE};
params.pCounterDataBuilder = data_builder.builder;
params.pRawMetricRequests = raw_metric_requests.data();
params.numMetricRequests = raw_metric_requests.size();
nvpw_call(NVPW_CounterDataBuilder_AddMetrics(&params));
}
{
NVPW_CounterDataBuilder_GetCounterDataPrefix_Params params = {
NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE};
params.pCounterDataBuilder = data_builder.builder;
params.bytesAllocated = 0;
params.pBuffer = nullptr;
nvpw_call(NVPW_CounterDataBuilder_GetCounterDataPrefix(&params));
m_data_image_prefix.resize(params.bytesCopied);
params.bytesAllocated = m_data_image_prefix.size();
params.pBuffer = m_data_image_prefix.data();
nvpw_call(NVPW_CounterDataBuilder_GetCounterDataPrefix(&params));
}
}
namespace
{
// Asks CUPTI how many bytes the counter data image needs for the given
// options.
[[nodiscard]] std::size_t
get_counter_data_image_size(CUpti_Profiler_CounterDataImageOptions *options)
{
CUpti_Profiler_CounterDataImage_CalculateSize_Params params = {
CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE};
params.pOptions = options;
params.sizeofCounterDataImageOptions =
CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
cupti_call(cuptiProfilerCounterDataImageCalculateSize(&params));
return params.counterDataImageSize;
}
} // namespace
// Sizes and initializes the counter-data image and its scratch buffer for the
// single user range used by this profiler.
void cupti_profiler::initialize_counter_data_image()
{
  // Zero-initialize: the options struct was previously left uninitialized, so
  // any field not explicitly assigned below held indeterminate garbage.
  CUpti_Profiler_CounterDataImageOptions counter_data_image_options = {};
  counter_data_image_options.pCounterDataPrefix = &m_data_image_prefix[0];
  counter_data_image_options.counterDataPrefixSize = m_data_image_prefix.size();
  counter_data_image_options.maxNumRanges = m_num_ranges;
  counter_data_image_options.maxNumRangeTreeNodes = m_num_ranges;
  counter_data_image_options.maxRangeNameLength = 64;

  m_data_image.resize(get_counter_data_image_size(&counter_data_image_options));

  {
    // Initialize the image in-place.
    CUpti_Profiler_CounterDataImage_Initialize_Params params = {
      CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE};
    params.sizeofCounterDataImageOptions =
      CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
    params.pOptions = &counter_data_image_options;
    params.counterDataImageSize = m_data_image.size();
    params.pCounterDataImage = &m_data_image[0];
    cupti_call(cuptiProfilerCounterDataImageInitialize(&params));
  }
  {
    // Size the scratch buffer the image needs during collection.
    CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params params = {
      CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE};
    params.counterDataImageSize = m_data_image.size();
    params.pCounterDataImage = &m_data_image[0];
    cupti_call(
      cuptiProfilerCounterDataImageCalculateScratchBufferSize(&params));
    m_data_scratch_buffer.resize(params.counterDataScratchBufferSize);
  }
  {
    // Bind the scratch buffer to the image.
    CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params params = {
      CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE};
    params.counterDataImageSize = m_data_image.size();
    params.pCounterDataImage = &m_data_image[0];
    params.counterDataScratchBufferSize = m_data_scratch_buffer.size();
    params.pCounterDataScratchBuffer = &m_data_scratch_buffer[0];
    cupti_call(cuptiProfilerCounterDataImageInitializeScratchBuffer(&params));
  }
}
// Deinitializes CUPTI, but only if this instance completed construction and
// was not moved from (m_available is cleared on move).
cupti_profiler::~cupti_profiler()
{
if (is_initialized())
{
CUpti_Profiler_DeInitialize_Params params = {
CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE};
// Destructor must not throw, so the result is deliberately unchecked.
cuptiProfilerDeInitialize(&params);
}
}
// True once the constructor finished every init stage (and this instance
// still owns the CUPTI session).
bool cupti_profiler::is_initialized() const
{
return m_available;
}
// Opens the profiling session and applies the config image. Must be called
// once before the replay loop (see the usage example in cupti_profiler.cuh).
void cupti_profiler::prepare_user_loop()
{
{
CUpti_Profiler_BeginSession_Params params = {
CUpti_Profiler_BeginSession_Params_STRUCT_SIZE};
params.ctx = nullptr;
params.counterDataImageSize = m_data_image.size();
params.pCounterDataImage = &m_data_image[0];
params.counterDataScratchBufferSize = m_data_scratch_buffer.size();
params.pCounterDataScratchBuffer = &m_data_scratch_buffer[0];
// User range/replay mode: we define a single manual range per measurement
// (m_num_ranges == 1) instead of one auto range per kernel.
params.range = CUPTI_UserRange;
params.replayMode = CUPTI_UserReplay;
params.maxRangesPerPass = m_num_ranges;
params.maxLaunchesPerPass = m_num_ranges;
cupti_call(cuptiProfilerBeginSession(&params));
}
{
CUpti_Profiler_SetConfig_Params params = {
CUpti_Profiler_SetConfig_Params_STRUCT_SIZE};
params.pConfig = &m_config_image[0];
params.configSize = m_config_image.size();
params.minNestingLevel = 1;
params.numNestingLevels = 1;
params.passIndex = 0;
cupti_call(cuptiProfilerSetConfig(&params));
}
}
// Begins one replay pass: start the pass, enable profiling, and push the
// single user range. Call before the kernels in each replay iteration.
void cupti_profiler::start_user_loop()
{
{
CUpti_Profiler_BeginPass_Params params = {
CUpti_Profiler_BeginPass_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerBeginPass(&params));
}
{
CUpti_Profiler_EnableProfiling_Params params = {
CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerEnableProfiling(&params));
}
{
CUpti_Profiler_PushRange_Params params = {
CUpti_Profiler_PushRange_Params_STRUCT_SIZE};
std::string rangeName = "nvbench";
params.pRangeName = rangeName.c_str();
cupti_call(cuptiProfilerPushRange(&params));
}
}
// Ends the current replay pass: pop the range and disable profiling, in the
// reverse order of start_user_loop().
void cupti_profiler::stop_user_loop()
{
{
CUpti_Profiler_PopRange_Params params = {
CUpti_Profiler_PopRange_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerPopRange(&params));
}
{
CUpti_Profiler_DisableProfiling_Params params = {
CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerDisableProfiling(&params));
}
}
// Ends the current pass and reports whether CUPTI needs more replay passes
// to collect all configured counters.
bool cupti_profiler::is_replay_required()
{
CUpti_Profiler_EndPass_Params params = {
CUpti_Profiler_EndPass_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerEndPass(&params));
return !params.allPassesSubmitted;
}
// Finalizes the measurement after the replay loop: flush counter data into
// the image, then tear down config and session.
void cupti_profiler::process_user_loop()
{
{
CUpti_Profiler_FlushCounterData_Params params = {
CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerFlushCounterData(&params));
}
{
CUpti_Profiler_UnsetConfig_Params params = {
CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerUnsetConfig(&params));
}
{
CUpti_Profiler_EndSession_Params params = {
CUpti_Profiler_EndSession_Params_STRUCT_SIZE};
cupti_call(cuptiProfilerEndSession(&params));
}
}
// Decode the collected counter-data image into one double per requested
// metric, in the same order as m_metric_names.
//
// Fix: NVPW_MetricsEvaluator_SetDeviceAttributes depends only on the
// evaluator and the counter-data image, not on any particular metric, so it
// is now bound once before the loop instead of once per metric.
std::vector<double> cupti_profiler::get_counter_values()
{
  metric_evaluator evaluator(m_chip_name,
                             m_availability_image.data(),
                             m_data_image.data(),
                             m_data_image.size());

  { // User-range profiling is configured to produce exactly one range;
    // anything else indicates a broken session.
    NVPW_CounterData_GetNumRanges_Params params = {
      NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE};
    params.pCounterDataImage = m_data_image.data();
    nvpw_call(NVPW_CounterData_GetNumRanges(&params));
    if (params.numRanges != 1)
    {
      throw std::runtime_error("Something's gone wrong, one range is expected");
    }
  }

  { // Loop-invariant: bind the counter-data image to the evaluator once.
    NVPW_MetricsEvaluator_SetDeviceAttributes_Params params = {
      NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE};
    params.pMetricsEvaluator    = evaluator;
    params.pCounterDataImage    = m_data_image.data();
    params.counterDataImageSize = m_data_image.size();
    nvpw_call(NVPW_MetricsEvaluator_SetDeviceAttributes(&params));
  }

  const std::size_t range_id{}; // there's only one range
  std::size_t result_id{};
  std::vector<double> result(m_metric_names.size());

  for (const std::string &metric_name : m_metric_names)
  {
    eval_request request = evaluator.create_request(metric_name);

    NVPW_MetricsEvaluator_EvaluateToGpuValues_Params params = {
      NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE};
    params.pMetricsEvaluator           = evaluator;
    params.pMetricEvalRequests         = &request.request;
    params.numMetricEvalRequests       = 1;
    params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
    params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
    params.pCounterDataImage           = m_data_image.data();
    params.counterDataImageSize        = m_data_image.size();
    params.rangeIndex                  = range_id;
    params.isolated                    = true;
    // Each metric's value is written directly into its slot in `result`.
    params.pMetricValues               = &result[result_id++];
    nvpw_call(NVPW_MetricsEvaluator_EvaluateToGpuValues(&params));
  }
  return result;
}
} // namespace nvbench::detail

View File

@@ -42,7 +42,7 @@ measure_cold_base::measure_cold_base(state &exec_state)
, m_min_time{exec_state.get_min_time()}
, m_skip_time{exec_state.get_skip_time()}
, m_timeout{exec_state.get_timeout()}
{}
{ }
void measure_cold_base::check()
{

View File

@@ -185,6 +185,7 @@ private:
}
kernel_launch_timer<use_blocking_kernel> timer(*this);
this->launch_kernel(timer);
this->check_skip_time(m_cuda_timer.get_duration());
}

View File

@@ -0,0 +1,263 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/benchmark_base.cuh>
#include <nvbench/detail/measure_cupti.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/printer_base.cuh>
#include <nvbench/state.cuh>
#include <fmt/format.h>
#include <algorithm>
#include <stdexcept>
namespace nvbench::detail
{
namespace
{
// Identifier for each CUPTI hardware metric NVBench can collect. Values are
// contiguous integers starting at 0 so compile-time recursion over all
// metrics can terminate at `count` (see add_metrics_impl / gen_summary).
enum class metric_id : int
{
dram_peak_sustained_throughput = 0,
global_load_efficiency,
global_store_efficiency,
l1_hit_rate,
l2_hit_rate,
count
};
// Compile-time descriptor for a single CUPTI metric. Each specialization
// provides:
//  - metric_name:  the CUPTI/PerfWorks metric identifier to collect,
//  - summary / hint / short_name / description: presentation strings for
//    the nvbench summary output,
//  - divider:      scale factor applied to the raw value before reporting,
//  - is_collected: whether the user requested this metric on the state.
template <metric_id id>
struct metric_traits;
// Device-memory utilization as a percentage of peak sustained throughput.
template <>
struct metric_traits<metric_id::dram_peak_sustained_throughput>
{
static constexpr const char *metric_name =
"dram__throughput.avg.pct_of_peak_sustained_elapsed";
static constexpr const char *summary =
"Peak Sustained Global Memory Throughput (HW)";
static constexpr const char *hint = "percentage";
static constexpr const char *short_name = "HBWPeak";
static constexpr const char *description =
"The utilization level of the device memory relative to the peak "
"utilization.";
static constexpr double divider = 100.0;
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_dram_throughput_collected();
};
};
// Efficiency of global memory loads (requested vs. required throughput).
template <>
struct metric_traits<metric_id::global_load_efficiency>
{
static constexpr const char *metric_name =
"smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct";
static constexpr const char *summary = "Global Load Efficiency (HW)";
static constexpr const char *hint = "percentage";
static constexpr const char *short_name = "LoadEff";
static constexpr const char *description =
"Ratio of requested global memory load throughput to required global "
"memory load throughput expressed as percentage.";
static constexpr double divider = 100.0;
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_loads_efficiency_collected();
};
};
// Efficiency of global memory stores (requested vs. required throughput).
template <>
struct metric_traits<metric_id::global_store_efficiency>
{
static constexpr const char *metric_name =
"smsp__sass_average_data_bytes_per_sector_mem_global_op_st.pct";
static constexpr const char *summary = "Global Store Efficiency (HW)";
static constexpr const char *hint = "percentage";
static constexpr const char *short_name = "StoreEff";
static constexpr const char *description =
"Ratio of requested global memory store throughput to required global "
"memory store throughput expressed as percentage.";
static constexpr double divider = 100.0;
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_stores_efficiency_collected();
};
};
// Sector hit rate at the L1/TEX cache.
template <>
struct metric_traits<metric_id::l1_hit_rate>
{
static constexpr const char *metric_name = "l1tex__t_sector_hit_rate.pct";
static constexpr const char *summary = "L1 Cache Hit Rate (HW)";
static constexpr const char *hint = "percentage";
static constexpr const char *short_name = "L1HitRate";
static constexpr const char *description = "Hit rate at L1 cache.";
static constexpr double divider = 100.0;
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_l1_hit_rate_collected();
};
};
// Sector hit rate at the L2 cache.
template <>
struct metric_traits<metric_id::l2_hit_rate>
{
static constexpr const char *metric_name = "lts__t_sector_hit_rate.pct";
static constexpr const char *summary = "L2 Cache Hit Rate (HW)";
static constexpr const char *hint = "percentage";
static constexpr const char *short_name = "L2HitRate";
static constexpr const char *description = "Hit rate at L2 cache.";
static constexpr double divider = 100.0;
static bool is_collected(nvbench::state &m_state)
{
return m_state.is_l2_hit_rate_collected();
};
};
// Recursively visit every metric_id at compile time and append the CUPTI
// metric name of each metric the user requested. Names are appended in
// metric_id order; gen_summary relies on this ordering when reading the
// evaluated results back.
template <metric_id id = metric_id::dram_peak_sustained_throughput>
void add_metrics_impl(nvbench::state &state, std::vector<std::string> &metrics)
{
if (metric_traits<id>::is_collected(state))
{
metrics.emplace_back(metric_traits<id>::metric_name);
}
constexpr auto next_id = static_cast<metric_id>(static_cast<int>(id) + 1);
add_metrics_impl<next_id>(state, metrics);
}
// Base case: the recursion terminates at metric_id::count.
template <>
void add_metrics_impl<metric_id::count>(nvbench::state &,
std::vector<std::string> &)
{}
// Build the list of CUPTI metric names requested on `state`.
std::vector<std::string> add_metrics(nvbench::state &state)
{
std::vector<std::string> metrics;
metrics.reserve(static_cast<int>(metric_id::count));
add_metrics_impl(state, metrics);
return metrics;
}
} // namespace
// Construct the CUPTI profiler for the state's device with the set of
// metrics the user requested. The function-try-block logs construction
// failures as a warning; note that for constructors the exception is
// implicitly rethrown when the handler finishes, so the failure still
// propagates to the caller after being logged.
measure_cupti_base::measure_cupti_base(state &exec_state)
try : m_state{exec_state}, m_cupti(*m_state.get_device(), add_metrics(m_state))
{}
catch (const std::exception &ex)
{
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
printer_opt_ref)
{
auto &printer = printer_opt_ref.value().get();
printer.log(nvbench::log_level::warn,
fmt::format("CUPTI failed to construct profiler: {}",
ex.what()));
}
}
// Validate the preconditions for a CUPTI measurement: the state must have a
// device, and that device must currently be active.
void measure_cupti_base::check()
{
  const auto dev = m_state.get_device();
  if (!dev)
  {
    NVBENCH_THROW(std::runtime_error,
                  "{}",
                  "Device required for `cupti` measurement.");
  }
  // An inactive device here means something went wrong higher up the stack.
  if (!dev->is_active())
  {
    NVBENCH_THROW(std::runtime_error,
                  "{}",
                  "Internal error: Current device is not active.");
  }
}
namespace
{
// Recursively walk all metric_ids at compile time and, for each metric the
// user requested, publish its value as a summary on the state. `result`
// holds only the collected metrics, densely packed in metric_id order (the
// same order add_metrics used), so `result_id` advances only when a metric
// was actually collected.
template <metric_id id = metric_id::dram_peak_sustained_throughput>
void gen_summary(std::size_t result_id,
nvbench::state &m_state,
const std::vector<double> &result)
{
using metric = metric_traits<id>;
if (metric::is_collected(m_state))
{
auto &summ = m_state.add_summary(metric::summary);
summ.set_string("hint", metric::hint);
summ.set_string("short_name", metric::short_name);
summ.set_string("description", metric::description);
// Rescale the raw value (every current metric divides by 100).
summ.set_float64("value", result[result_id++] / metric::divider);
}
constexpr auto next_id = static_cast<metric_id>(static_cast<int>(id) + 1);
gen_summary<next_id>(result_id, m_state, result);
}
// Base case: past the last metric, nothing to publish.
template <>
void gen_summary<metric_id::count>(std::size_t,
nvbench::state &,
const std::vector<double> &)
{}
// Entry point: publish a summary for every collected metric in `result`.
void gen_summaries(nvbench::state &state, const std::vector<double> &result)
{
gen_summary(0, state, result);
}
} // namespace
// Evaluate the collected counters and attach them to the state as
// summaries. This is a function-try-block on an ordinary (non-constructor)
// function, so — unlike the constructor above — the handler swallows the
// exception: CUPTI failures are demoted to a warning log instead of
// aborting the benchmark run.
void measure_cupti_base::generate_summaries()
try
{
gen_summaries(m_state, m_cupti.get_counter_values());
}
catch (const std::exception &ex)
{
if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
printer_opt_ref)
{
auto &printer = printer_opt_ref.value().get();
printer.log(nvbench::log_level::warn,
fmt::format("CUPTI failed to generate the summary: {}",
ex.what()));
}
}
} // namespace nvbench::detail

View File

@@ -0,0 +1,148 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/blocking_kernel.cuh>
#include <nvbench/config.cuh>
#include <nvbench/cpu_timer.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/cuda_timer.cuh>
#include <nvbench/cupti_profiler.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#include <nvbench/detail/l2flush.cuh>
#include <nvbench/detail/statistics.cuh>
#include <cuda_runtime.h>
#include <algorithm>
#include <utility>
#include <vector>
namespace nvbench
{
struct state;
namespace detail
{
// non-templated code goes here:
// Non-templated base for the CUPTI measurement. Owns the launch/stream,
// an L2-flush helper, and the CUPTI profiler session. Not copyable or
// movable.
struct measure_cupti_base
{
explicit measure_cupti_base(nvbench::state &exec_state);
measure_cupti_base(const measure_cupti_base &) = delete;
measure_cupti_base(measure_cupti_base &&) = delete;
measure_cupti_base &operator=(const measure_cupti_base &) = delete;
measure_cupti_base &operator=(measure_cupti_base &&) = delete;
protected:
struct kernel_launch_timer;
// Throws std::runtime_error if the state has no device or if that device
// is not currently active.
void check();
// Read back the evaluated counter values and publish them on m_state.
void generate_summaries();
// Flush the device's L2 cache on the measurement stream.
__forceinline__ void flush_device_l2()
{
m_l2flush.flush(m_launch.get_stream());
}
// Block until all work queued on the measurement stream has completed.
__forceinline__ void sync_stream() const
{
NVBENCH_CUDA_CALL(cudaStreamSynchronize(m_launch.get_stream()));
}
nvbench::state &m_state;
nvbench::launch m_launch;
nvbench::detail::l2flush m_l2flush;
cupti_profiler m_cupti;
};
// Hooks invoked by the kernel launcher around each launch during the CUPTI
// replay loop: start() flushes L2 and synchronizes before opening the CUPTI
// user range; stop() closes the range and synchronizes again.
struct measure_cupti_base::kernel_launch_timer
{
explicit kernel_launch_timer(measure_cupti_base &measure)
: m_measure{measure}
{}
// Invoked before the kernel launch.
__forceinline__ void start()
{
m_measure.flush_device_l2();
m_measure.sync_stream();
if (m_measure.m_cupti.is_initialized())
{
m_measure.m_cupti.start_user_loop();
}
}
// Invoked after the kernel launch.
__forceinline__ void stop()
{
if (m_measure.m_cupti.is_initialized())
{
m_measure.m_cupti.stop_user_loop();
}
m_measure.sync_stream();
}
private:
measure_cupti_base &m_measure;
};
// Measurement that runs the kernel under CUPTI replay: the kernel is
// launched repeatedly until the profiler reports all required passes have
// been submitted, then the counters are decoded into summaries.
template <typename KernelLauncher>
struct measure_cupti : public measure_cupti_base
{
measure_cupti(nvbench::state &state, KernelLauncher &kernel_launcher)
: measure_cupti_base(state)
, m_kernel_launcher{kernel_launcher}
{}
// Validate preconditions, run the replay loop, publish summaries.
void operator()()
{
this->check();
this->run();
this->generate_summaries();
}
private:
// Run the kernel as many times as CUPTI requires.
void run()
{
kernel_launch_timer timer(*this);
m_cupti.prepare_user_loop();
do
{
m_kernel_launcher(m_launch, timer);
} while (m_cupti.is_replay_required());
m_cupti.process_user_loop();
}
KernelLauncher &m_kernel_launcher;
};
} // namespace detail
} // namespace nvbench

View File

@@ -23,10 +23,14 @@
"Do not include it directly."
#endif // NVBENCH_STATE_EXEC_GUARD
#include <nvbench/config.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/state.cuh>
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#ifdef NVBENCH_HAS_CUPTI
#include <nvbench/detail/measure_cupti.cuh>
#endif // NVBENCH_HAS_CUPTI
#include <nvbench/detail/measure_cold.cuh>
#include <nvbench/detail/measure_hot.cuh>
@@ -83,6 +87,19 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
constexpr bool use_blocking_kernel = !(tags & no_block);
if constexpr (tags & timer)
{
// Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
if constexpr (!(modifier_tags & run_once))
{
if (this->is_cupti_required())
{
using measure_t = nvbench::detail::measure_cupti<KL>;
measure_t measure{*this, kernel_launcher};
measure();
}
}
#endif
using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
measure_t measure{*this, kernel_launcher};
measure();
@@ -90,9 +107,23 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
else
{ // Need to wrap the kernel launcher with a timer wrapper:
using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
wrapper_t wrapper{kernel_launcher};
// Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
if constexpr (!(modifier_tags & run_once))
{
if (this->is_cupti_required())
{
using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
measure_t measure{*this, wrapper};
measure();
}
}
#endif
using measure_t =
nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
wrapper_t wrapper{kernel_launcher};
measure_t measure(*this, wrapper);
measure();
}

View File

@@ -144,4 +144,20 @@ catch (nvml::call_failed &e)
}
#endif // NVBENCH_HAS_NVML
#ifdef NVBENCH_HAS_CUPTI
// Return the current CUDA driver context. The device must be the active one
// (see set_active); cuCtxGetCurrent reports the calling thread's current
// context, so calling this for an inactive device would be meaningless.
[[nodiscard]] CUcontext device_info::get_context() const
{
if (!is_active())
{
NVBENCH_THROW(std::runtime_error,
"{}",
"get_context is called for inactive device");
}
CUcontext cu_context;
NVBENCH_DRIVER_API_CALL(cuCtxGetCurrent(&cu_context));
return cu_context;
}
#endif
} // namespace nvbench

View File

@@ -18,12 +18,14 @@
#pragma once
#include <nvbench/config.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/detail/device_scope.cuh>
#include <cuda_runtime_api.h>
#include <cstdint> // CHAR_BIT
#include <stdexcept>
#include <string_view>
#include <utility>
@@ -67,6 +69,13 @@ struct device_info
/// Make this device the active CUDA device for the calling thread.
void set_active() const
{
NVBENCH_CUDA_CALL(cudaSetDevice(m_id));
#ifdef NVBENCH_HAS_CUPTI
// cudaSetDevice doesn't initialize a context on the first call, so we have
// to force it. According to the documentation, if devPtr is 0, no
// operation is performed.
NVBENCH_CUDA_CALL(cudaFree(nullptr));
#endif
}
/// Enable or disable persistence mode.
@@ -200,6 +209,20 @@ struct device_info
/// @return True if ECC is enabled on this device.
[[nodiscard]] bool get_ecc_state() const { return m_prop.ECCEnabled; }
/// @return True if CUPTI supports this device.
/// CUPTI profiling is only reported as available for devices with SM major
/// version >= 7, and only when NVBench was built with CUPTI support.
[[nodiscard]] bool is_cupti_supported() const
{
#ifdef NVBENCH_HAS_CUPTI
return m_prop.major >= 7;
#else
return false;
#endif
}
#ifdef NVBENCH_HAS_CUPTI
[[nodiscard]] CUcontext get_context() const;
#endif
/// @return A cached copy of the device's cudaDeviceProp.
[[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const
{

View File

@@ -20,6 +20,7 @@
#include <nvbench/benchmark_base.cuh>
#include <nvbench/benchmark_manager.cuh>
#include <nvbench/config.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/option_parser.cuh>
#include <nvbench/printer_base.cuh>
@@ -45,9 +46,16 @@
return 1; \
}
#ifdef NVBENCH_HAS_CUPTI
#define NVBENCH_INITIALIZE_DRIVER_API NVBENCH_DRIVER_API_CALL(cuInit(0));
#else
#define NVBENCH_INITIALIZE_DRIVER_API
#endif
#define NVBENCH_MAIN_BODY(argc, argv) \
do \
{ \
NVBENCH_INITIALIZE_DRIVER_API \
nvbench::option_parser parser; \
parser.parse(argc, argv); \
auto &printer = parser.get_printer(); \

View File

@@ -221,6 +221,51 @@ struct state
return m_benchmark;
}
/// Request collection of the L1 cache hit-rate CUPTI metric.
void collect_l1_hit_rates() { m_collect_l1_hit_rates = true; }
/// Request collection of the L2 cache hit-rate CUPTI metric.
void collect_l2_hit_rates() { m_collect_l2_hit_rates = true; }
/// Request collection of the global-store efficiency CUPTI metric.
void collect_stores_efficiency() { m_collect_stores_efficiency = true; }
/// Request collection of the global-load efficiency CUPTI metric.
void collect_loads_efficiency() { m_collect_loads_efficiency = true; }
/// Request collection of the DRAM throughput CUPTI metric.
void collect_dram_throughput() { m_collect_dram_throughput = true; }
/// Convenience helper: request every CUPTI metric listed above.
void collect_cupti_metrics()
{
collect_l1_hit_rates();
collect_l2_hit_rates();
collect_stores_efficiency();
collect_loads_efficiency();
collect_dram_throughput();
}
/// @return True if the L1 hit-rate metric was requested.
[[nodiscard]] bool is_l1_hit_rate_collected() const
{
return m_collect_l1_hit_rates;
}
/// @return True if the L2 hit-rate metric was requested.
[[nodiscard]] bool is_l2_hit_rate_collected() const
{
return m_collect_l2_hit_rates;
}
/// @return True if the store-efficiency metric was requested.
[[nodiscard]] bool is_stores_efficiency_collected() const
{
return m_collect_stores_efficiency;
}
/// @return True if the load-efficiency metric was requested.
[[nodiscard]] bool is_loads_efficiency_collected() const
{
return m_collect_loads_efficiency;
}
/// @return True if the DRAM throughput metric was requested.
[[nodiscard]] bool is_dram_throughput_collected() const
{
return m_collect_dram_throughput;
}
/// @return True if any CUPTI metric was requested, i.e. a CUPTI
/// measurement needs to run for this state.
[[nodiscard]] bool is_cupti_required() const
{
return is_l2_hit_rate_collected()
|| is_l1_hit_rate_collected()
|| is_stores_efficiency_collected()
|| is_loads_efficiency_collected()
|| is_dram_throughput_collected();
}
summary &add_summary(std::string summary_name);
summary &add_summary(summary s);
[[nodiscard]] const summary &get_summary(std::string_view name) const;
@@ -279,6 +324,12 @@ private:
std::string m_skip_reason;
std::size_t m_element_count{};
std::size_t m_global_memory_rw_bytes{};
bool m_collect_l1_hit_rates{};
bool m_collect_l2_hit_rates{};
bool m_collect_stores_efficiency{};
bool m_collect_loads_efficiency{};
bool m_collect_dram_throughput{};
};
} // namespace nvbench

View File

@@ -7,4 +7,12 @@ find_package(NVBench)
add_executable(test_bench test_bench.cu)
target_link_libraries(test_bench PRIVATE nvbench::main)

enable_testing()
# Register the binary once, as `test_bench`. The stale duplicate
# registration under the name `test` is removed: it pointed at the same
# binary but never received the CUPTI ENVIRONMENT property set below, so it
# would fail at runtime on UNIX when CUPTI is enabled.
add_test(NAME test_bench COMMAND "$<TARGET_FILE:test_bench>" --timeout 1)

# Need to add the CUPTI path to LD_LIBRARY_PATH to make sure CUPTI libraries
# are found at runtime:
if (UNIX AND TARGET nvbench::cupti)
  get_property(cupti_lib_path TARGET nvbench::cupti PROPERTY IMPORTED_LOCATION)
  cmake_path(GET cupti_lib_path PARENT_PATH cupti_lib_path)
  set_property(TEST test_bench PROPERTY ENVIRONMENT "LD_LIBRARY_PATH=${cupti_lib_path}")
endif()