diff --git a/.gitignore b/.gitignore index 567609b..d43a4f4 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ build/ +.idea +cmake-build-* diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d0268a..70d2d16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ project(NVBench nvbench_init_rapids_cmake() option(NVBench_ENABLE_NVML "Build with NVML support from the Cuda Toolkit." ON) +option(NVBench_ENABLE_CUPTI "Build NVBench with CUPTI." ON) option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF) option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF) diff --git a/cmake/NVBenchCUPTI.cmake b/cmake/NVBenchCUPTI.cmake new file mode 100644 index 0000000..aad01ce --- /dev/null +++ b/cmake/NVBenchCUPTI.cmake @@ -0,0 +1,71 @@ +# Since this file is installed, we need to make sure that the CUDAToolkit has +# been found by consumers: +if (NOT TARGET CUDA::toolkit) + find_package(CUDAToolkit REQUIRED) +endif() + +set(nvbench_cupti_root "${CUDAToolkit_LIBRARY_ROOT}/extras/CUPTI") + +# The CUPTI targets in FindCUDAToolkit are broken: +# - The dll locations are not specified +# - Dependent libraries nvperf_* are not linked. +# So we create our own targets: +function(nvbench_add_cupti_dep dep_name) + string(TOLOWER ${dep_name} dep_name_lower) + string(TOUPPER ${dep_name} dep_name_upper) + + add_library(nvbench::${dep_name_lower} SHARED IMPORTED) + + if (WIN32) + # Attempt to locate the dll in the expected location. This is necessary + # because the CUPTI dll has a versioned suffix, so we can't directly search + # for it with find_file. + file(GLOB dep_dll_path "${nvbench_cupti_root}/lib64/${dep_name_lower}*dll") + cmake_path(GET dep_dll_path FILENAME dep_dll_filename) + + # If the dll was not found in the expected location, use a default filename as a user hint. + if (NOT dep_dll_filename) + set(dep_dll_filename ${dep_name_lower}.dll) + endif() + + # Use find_file to create a cache variable and mark the file as REQUIRED. 
+ find_file(NVBench_${dep_name_upper}_DLL ${dep_dll_filename} REQUIRED + DOC "The full path to ${dep_name_lower}.dll from the CUDA Toolkit." + HINTS "${nvbench_cupti_root}/lib64/" + ) + mark_as_advanced(NVBench_${dep_name_upper}_DLL) + + # The .libs don't have suffixes, so we can just directly search for them. + find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower}.lib REQUIRED + DOC "The full path to ${dep_name_lower}.lib from the CUDA Toolkit." + HINTS "${nvbench_cupti_root}/lib64/" + ) + mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY) + + set_target_properties(nvbench::${dep_name_lower} PROPERTIES + IMPORTED_LOCATION "${NVBench_${dep_name_upper}_DLL}" + IMPORTED_IMPLIB "${NVBench_${dep_name_upper}_LIBRARY}" + ) + else() + find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower} REQUIRED + DOC "The full path to lib${dep_name_lower}.so from the CUDA Toolkit." + HINTS "${nvbench_cupti_root}/lib64" + ) + mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY) + + set_target_properties(nvbench::${dep_name_lower} PROPERTIES + IMPORTED_LOCATION "${NVBench_${dep_name_upper}_LIBRARY}" + ) + endif() +endfunction() + +nvbench_add_cupti_dep(nvperf_target) +nvbench_add_cupti_dep(nvperf_host) +nvbench_add_cupti_dep(cupti) +target_link_libraries(nvbench::cupti INTERFACE + nvbench::nvperf_target + nvbench::nvperf_host +) +target_include_directories(nvbench::cupti INTERFACE + "${nvbench_cupti_root}/include" +) diff --git a/cmake/NVBenchDependencies.cmake b/cmake/NVBenchDependencies.cmake index 78e63a6..761210c 100644 --- a/cmake/NVBenchDependencies.cmake +++ b/cmake/NVBenchDependencies.cmake @@ -64,3 +64,10 @@ if (NVBench_ENABLE_NVML) include("${CMAKE_CURRENT_LIST_DIR}/NVBenchNVML.cmake") list(APPEND ctk_libraries nvbench::nvml) endif() + +################################################################################ +# CUDAToolkit -> CUPTI +if (NVBench_ENABLE_CUPTI) + include("${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake") + list(APPEND ctk_libraries 
CUDA::cuda_driver nvbench::cupti) +endif() diff --git a/cmake/NVBenchDependentDlls.cmake b/cmake/NVBenchDependentDlls.cmake index 4ebf784..bd9270d 100644 --- a/cmake/NVBenchDependentDlls.cmake +++ b/cmake/NVBenchDependentDlls.cmake @@ -23,7 +23,9 @@ endif() function(nvbench_setup_dep_dlls target_name) # The custom command below fails when there aren't any runtime DLLs to copy, # so only enable it when a relevant dependency is enabled: - if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD AND NVBench_ENABLE_NVML) + if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD AND + (NVBench_ENABLE_NVML OR + NVBench_ENABLE_CUPTI)) add_custom_command(TARGET ${target_name} POST_BUILD COMMAND diff --git a/cmake/NVBenchExports.cmake b/cmake/NVBenchExports.cmake index f18afc1..b4f0f6e 100644 --- a/cmake/NVBenchExports.cmake +++ b/cmake/NVBenchExports.cmake @@ -11,6 +11,15 @@ macro(nvbench_generate_exports) ) endif() + if (NVBench_ENABLE_CUPTI) + string(APPEND nvbench_build_export_code_block + "include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake\")\n" + ) + string(APPEND nvbench_install_export_code_block + "include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake\")\n" + ) + endif() + rapids_export(BUILD NVBench EXPORT_SET nvbench-targets NAMESPACE "nvbench::" diff --git a/cmake/NVBenchInstallRules.cmake b/cmake/NVBenchInstallRules.cmake index d287219..3b98de2 100644 --- a/cmake/NVBenchInstallRules.cmake +++ b/cmake/NVBenchInstallRules.cmake @@ -39,6 +39,14 @@ if (NVBench_ENABLE_NVML) ) endif() +if (NVBench_ENABLE_CUPTI) + install( + FILES + "${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake" + DESTINATION "${config_install_location}" + ) +endif() + # Call with a list of library targets to generate install rules: function(nvbench_install_libraries) install(TARGETS ${ARGN} diff --git a/cmake/NVBenchWriteConfigHeader.cmake b/cmake/NVBenchWriteConfigHeader.cmake index a843bca..f0c0333 100644 --- a/cmake/NVBenchWriteConfigHeader.cmake +++ b/cmake/NVBenchWriteConfigHeader.cmake @@ -3,5 +3,9 @@ 
function(nvbench_write_config_header filepath) set(NVBENCH_HAS_NVML 1) endif() + if (NVBench_ENABLE_CUPTI) + set(NVBENCH_HAS_CUPTI 1) + endif() + configure_file("${NVBench_SOURCE_DIR}/cmake/config.cuh.in" "${filepath}") endfunction() diff --git a/cmake/config.cuh.in b/cmake/config.cuh.in index f3e98cf..e04cce1 100644 --- a/cmake/config.cuh.in +++ b/cmake/config.cuh.in @@ -20,3 +20,6 @@ // Defined if NVBench has been built with NVML support. #cmakedefine NVBENCH_HAS_NVML + +// Defined if NVBench has been built with CUPTI support. +#cmakedefine NVBENCH_HAS_CUPTI diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4e15877..34c8763 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -5,6 +5,7 @@ set(example_srcs exec_tag_timer.cu skip.cu throughput.cu + auto_throughput.cu ) # Metatarget for all examples: diff --git a/examples/auto_throughput.cu b/examples/auto_throughput.cu new file mode 100644 index 0000000..2e49e3c --- /dev/null +++ b/examples/auto_throughput.cu @@ -0,0 +1,87 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +// Thrust vectors simplify memory management: +#include + +template +__global__ void kernel(std::size_t stride, + std::size_t elements, + const nvbench::int32_t * __restrict__ in, + nvbench::int32_t *__restrict__ out) +{ + const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const std::size_t step = gridDim.x * blockDim.x; + + for (std::size_t i = stride * tid; + i < stride * elements; + i += stride * step) + { + for (int j = 0; j < ItemsPerThread; j++) + { + const auto read_id = (ItemsPerThread * i + j) % elements; + const auto write_id = tid + j * elements; + out[write_id] = in[read_id]; + } + } +} + + +// `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput +// and cache hit rates. +// +// Calling state.collect_*() enables particular metric collection if nvbench +// was build with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON). +template +void throughput_bench(nvbench::state &state, + nvbench::type_list>) +{ + // Allocate input data: + const std::size_t stride = static_cast(state.get_int64("Stride")); + const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t); + thrust::device_vector input(elements); + thrust::device_vector output(elements * ItemsPerThread); + + // Provide throughput information: + state.add_element_count(elements, "Elements"); + state.collect_dram_throughput(); + state.collect_l1_hit_rates(); + state.collect_l2_hit_rates(); + state.collect_loads_efficiency(); + state.collect_stores_efficiency(); + + const auto threads_in_block = 256; + const auto blocks_in_grid = (elements + threads_in_block - 1) / + threads_in_block; + + state.exec([&](nvbench::launch &launch) { + kernel + <<>>( + stride, + elements, + thrust::raw_pointer_cast(input.data()), + thrust::raw_pointer_cast(output.data())); + }); +} + +using items_per_thread = nvbench::enum_type_list<1, 2>; + +NVBENCH_BENCH_TYPES(throughput_bench, NVBENCH_TYPE_AXES(items_per_thread)) + .add_int64_axis("Stride", 
nvbench::range(1, 4, 3)); diff --git a/nvbench/CMakeLists.txt b/nvbench/CMakeLists.txt index 30b2b96..1aeb2f6 100644 --- a/nvbench/CMakeLists.txt +++ b/nvbench/CMakeLists.txt @@ -26,6 +26,10 @@ set(srcs detail/state_generator.cxx ) +if (NVBench_ENABLE_CUPTI) + list(APPEND srcs detail/measure_cupti.cu cupti_profiler.cxx) +endif() + if (NVBench_ENABLE_NVML) list(APPEND srcs internal/nvml.cxx) endif() diff --git a/nvbench/cuda_call.cu b/nvbench/cuda_call.cu index d54f705..6cb304b 100644 --- a/nvbench/cuda_call.cu +++ b/nvbench/cuda_call.cu @@ -17,6 +17,7 @@ */ #include +#include #include @@ -44,6 +45,34 @@ void throw_error(const std::string &filename, command)); } +#ifdef NVBENCH_HAS_CUPTI +void throw_error(const std::string &filename, + std::size_t lineno, + const std::string &command, + CUresult error_code) +{ + const char *name = nullptr; + cuGetErrorName(error_code, &name); + + const char *string = nullptr; + cuGetErrorString(error_code, &string); + + throw std::runtime_error(fmt::format("{}:{}: Driver API call returned error: " + "{}: {}\nCommand: '{}'", + filename, + lineno, + name, + string, + command)); +} +#else +void throw_error(const std::string &, + std::size_t, + const std::string &, + CUresult) +{} +#endif + void exit_error(const std::string &filename, std::size_t lineno, const std::string &command, diff --git a/nvbench/cuda_call.cuh b/nvbench/cuda_call.cuh index 76dada6..f1d6c45 100644 --- a/nvbench/cuda_call.cuh +++ b/nvbench/cuda_call.cuh @@ -19,6 +19,7 @@ #pragma once #include +#include #include @@ -36,6 +37,20 @@ } \ } while (false) +/// Throws a std::runtime_error if `call` doesn't return `CUDA_SUCCESS`. 
+#define NVBENCH_DRIVER_API_CALL(call) \ + do \ + { \ + const CUresult nvbench_cuda_call_error = call; \ + if (nvbench_cuda_call_error != CUDA_SUCCESS) \ + { \ + nvbench::cuda_call::throw_error(__FILE__, \ + __LINE__, \ + #call, \ + nvbench_cuda_call_error); \ + } \ + } while (false) + /// Terminates process with failure status if `call` doesn't return /// `cudaSuccess`. #define NVBENCH_CUDA_CALL_NOEXCEPT(call) \ @@ -59,6 +74,11 @@ void throw_error(const std::string &filename, const std::string &call, cudaError_t error); +void throw_error(const std::string &filename, + std::size_t lineno, + const std::string &call, + CUresult error); + void exit_error(const std::string &filename, std::size_t lineno, const std::string &command, diff --git a/nvbench/cupti_profiler.cuh b/nvbench/cupti_profiler.cuh new file mode 100644 index 0000000..6e0e255 --- /dev/null +++ b/nvbench/cupti_profiler.cuh @@ -0,0 +1,129 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace nvbench::detail +{ + + +#ifdef NVBENCH_HAS_CUPTI +/** + * Pass required metrics in the constructor and organize your code as follows + * to get counters back: + * + * ``` + * cupti_profiler cupti( + * nvbench::device_info{0}, + * { "l1tex__t_sector_hit_rate.pct" }); + * + * cupti->prepare_user_loop(); + * + * do + * { + * m_measure.m_cupti->start_user_loop(); + * + * kernel_1<<<1, 1>>>(); + * // ... + * kernel_n<<<1, 1>>>(); + * + * m_measure.m_cupti->stop_user_loop(); + * } while(cupti->is_replay_required()); + * + * cupti->process_user_loop(); + * + * auto result = m_cupti->get_counter_values(); + * ``` + * + * Check Perfworks Metric table here for the full list of metrics: + * https://docs.nvidia.com/cupti/r_main.html#metrics-reference-7x + */ +class cupti_profiler +{ + bool m_available {}; + std::string m_chip_name; + + // Counter data + std::vector m_metric_names; + std::vector m_data_image_prefix; + std::vector m_config_image; + std::vector m_data_image; + std::vector m_data_scratch_buffer; + std::vector m_availability_image; + nvbench::device_info m_device; + + // CUPTI runs a series of replay passes, where each pass contains a sequence + // of ranges. Every metric enabled in the configuration is collected + // separately per unique range in the pass. CUPTI supports auto and + // user-defined ranges. With auto range mode, ranges are defined around each + // kernel automatically. In the user range mode, ranges are defined manually. + // We define a single user range for the whole measurement. 
+ static const int m_num_ranges = 1; + +public: + // Move only + cupti_profiler(cupti_profiler &&) noexcept; + cupti_profiler &operator=(cupti_profiler &&) noexcept; + + cupti_profiler(const cupti_profiler &) = delete; + cupti_profiler &operator=(const cupti_profiler &) = delete; + + cupti_profiler(nvbench::device_info device, + std::vector &&metric_names); + ~cupti_profiler(); + + [[nodiscard]] bool is_initialized() const; + + /// Should be called before replay loop + void prepare_user_loop(); + + /// Should be called before any kernel calls in the replay loop + void start_user_loop(); + + /// Should be called after all kernel calls in the replay loop + void stop_user_loop(); + + /// Should be called after the replay loop + void process_user_loop(); + + /// Indicates whether another iteration of the replay loop is required + [[nodiscard]] bool is_replay_required(); + + /// Returns counters for metrics requested in the constructor + [[nodiscard]] std::vector get_counter_values(); + +private: + void initialize_profiler(); + void initialize_chip_name(); + void initialize_availability_image(); + static void initialize_nvpw(); + void initialize_config_image(); + void initialize_counter_data_prefix_image(); + void initialize_counter_data_image(); +}; +#endif + + +} // namespace nvbench::detail diff --git a/nvbench/cupti_profiler.cxx b/nvbench/cupti_profiler.cxx new file mode 100644 index 0000000..63f3fbb --- /dev/null +++ b/nvbench/cupti_profiler.cxx @@ -0,0 +1,775 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. 
+ * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace nvbench::detail +{ + +namespace +{ + +void cupti_call(const CUptiResult status) +{ + if (status != CUPTI_SUCCESS) + { + const char *errstr{}; + cuptiGetResultString(status, &errstr); + + throw std::runtime_error( + fmt::format("CUPTI call returned error: {}\n", errstr)); + } +} + +void nvpw_call(const NVPA_Status status) +{ + if (status != NVPA_STATUS_SUCCESS) + { + throw std::runtime_error( + fmt::format("NVPW call returned error: {}\n", status)); + } +} + +} // namespace + +cupti_profiler::cupti_profiler(nvbench::device_info device, + std::vector &&metric_names) + : m_metric_names(metric_names) + , m_device(device) +{ + initialize_profiler(); + initialize_chip_name(); + initialize_availability_image(); + initialize_nvpw(); + initialize_config_image(); + initialize_counter_data_prefix_image(); + initialize_counter_data_image(); + + m_available = true; +} + +cupti_profiler::cupti_profiler(cupti_profiler &&rhs) noexcept + : m_device(rhs.m_device.get_id(), rhs.m_device.get_cuda_device_prop()) +{ + (*this) = std::move(rhs); +} + +cupti_profiler &cupti_profiler::operator=(cupti_profiler &&rhs) noexcept +{ + m_device = rhs.m_device; + m_available = rhs.m_available; + m_chip_name = std::move(rhs.m_chip_name); + m_metric_names = std::move(rhs.m_metric_names); + m_data_image_prefix = std::move(rhs.m_data_image_prefix); + m_config_image = std::move(rhs.m_config_image); + m_data_image = std::move(rhs.m_data_image); + 
m_data_scratch_buffer = std::move(rhs.m_data_scratch_buffer); + m_availability_image = std::move(rhs.m_availability_image); + + rhs.m_available = false; + + return *this; +} + +void cupti_profiler::initialize_profiler() +{ + if (!m_device.is_cupti_supported()) + { + throw std::runtime_error(fmt::format("Device: {} isn't supported (CC {})", + m_device.get_id(), + m_device.get_sm_version())); + } + + CUpti_Profiler_Initialize_Params params = { + CUpti_Profiler_Initialize_Params_STRUCT_SIZE}; + cupti_call(cuptiProfilerInitialize(¶ms)); +} + +void cupti_profiler::initialize_chip_name() +{ + CUpti_Device_GetChipName_Params params = { + CUpti_Device_GetChipName_Params_STRUCT_SIZE}; + params.deviceIndex = m_device.get_id(); + cupti_call(cuptiDeviceGetChipName(¶ms)); + + m_chip_name = std::string(params.pChipName); +} + +void cupti_profiler::initialize_availability_image() +{ + CUpti_Profiler_GetCounterAvailability_Params params = { + CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE}; + + params.ctx = m_device.get_context(); + + cupti_call(cuptiProfilerGetCounterAvailability(¶ms)); + + m_availability_image.clear(); + m_availability_image.resize(params.counterAvailabilityImageSize); + params.pCounterAvailabilityImage = m_availability_image.data(); + + cupti_call(cuptiProfilerGetCounterAvailability(¶ms)); +} + +void cupti_profiler::initialize_nvpw() +{ + NVPW_InitializeHost_Params params = {NVPW_InitializeHost_Params_STRUCT_SIZE}; + + nvpw_call(NVPW_InitializeHost(¶ms)); +} + +namespace +{ + +class eval_request +{ + NVPW_MetricsEvaluator *evaluator_ptr; + +public: + eval_request(NVPW_MetricsEvaluator *evaluator_ptr, + const std::string &metric_name) + : evaluator_ptr(evaluator_ptr) + { + NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params = { + NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE}; + + params.pMetricsEvaluator = evaluator_ptr; + params.pMetricName = metric_name.c_str(); + params.pMetricEvalRequest = 
&request; + params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE; + + nvpw_call( + NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(¶ms)); + } + + [[nodiscard]] std::vector get_raw_dependencies() + { + std::vector raw_dependencies; + + NVPW_MetricsEvaluator_GetMetricRawDependencies_Params params = { + NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE}; + + params.pMetricsEvaluator = evaluator_ptr; + params.pMetricEvalRequests = &request; + params.numMetricEvalRequests = 1; + params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE; + params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest); + + nvpw_call(NVPW_MetricsEvaluator_GetMetricRawDependencies(¶ms)); + + raw_dependencies.resize(params.numRawDependencies); + params.ppRawDependencies = raw_dependencies.data(); + + nvpw_call(NVPW_MetricsEvaluator_GetMetricRawDependencies(¶ms)); + + return raw_dependencies; + } + + NVPW_MetricEvalRequest request; +}; + +class metric_evaluator +{ + bool initialized{}; + NVPW_MetricsEvaluator *evaluator_ptr; + std::vector scratch_buffer; + +public: + metric_evaluator(const std::string &chip_name, + const std::uint8_t *counter_availability_image = nullptr, + const std::uint8_t *counter_data_image = nullptr, + const std::size_t counter_data_image_size = 0) + { + NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params + scratch_buffer_param = { + NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE}; + + scratch_buffer_param.pChipName = chip_name.c_str(); + scratch_buffer_param.pCounterAvailabilityImage = counter_availability_image; + + nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize( + &scratch_buffer_param)); + + scratch_buffer.resize(scratch_buffer_param.scratchBufferSize); + + NVPW_CUDA_MetricsEvaluator_Initialize_Params evaluator_params = { + NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE}; + + evaluator_params.scratchBufferSize = scratch_buffer.size(); + 
evaluator_params.pScratchBuffer = scratch_buffer.data(); + evaluator_params.pChipName = chip_name.c_str(); + evaluator_params.pCounterAvailabilityImage = counter_availability_image; + evaluator_params.pCounterDataImage = counter_data_image; + evaluator_params.counterDataImageSize = counter_data_image_size; + + nvpw_call(NVPW_CUDA_MetricsEvaluator_Initialize(&evaluator_params)); + + evaluator_ptr = evaluator_params.pMetricsEvaluator; + initialized = true; + } + + ~metric_evaluator() + { + if (initialized) + { + NVPW_MetricsEvaluator_Destroy_Params params = { + NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE}; + + params.pMetricsEvaluator = evaluator_ptr; + + nvpw_call(NVPW_MetricsEvaluator_Destroy(¶ms)); + } + } + + [[nodiscard]] eval_request create_request(const std::string &metric_name) + { + return {evaluator_ptr, metric_name}; + } + + [[nodiscard]] operator NVPW_MetricsEvaluator *() const + { + return evaluator_ptr; + } +}; + +} // namespace + +namespace +{ + +[[nodiscard]] std::vector get_raw_metric_requests( + const std::string &chip_name, + const std::vector &metric_names, + const std::uint8_t *counter_availability_image = nullptr) +{ + metric_evaluator evaluator(chip_name, counter_availability_image); + + std::vector raw_metric_names; + raw_metric_names.reserve(metric_names.size()); + + for (auto &metric_name : metric_names) + { + for (auto &raw_dependency : + evaluator.create_request(metric_name).get_raw_dependencies()) + { + raw_metric_names.push_back(raw_dependency); + } + } + + std::vector raw_requests; + raw_requests.reserve(raw_metric_names.size()); + + for (auto &raw_name : raw_metric_names) + { + NVPA_RawMetricRequest metricRequest = {NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE}; + metricRequest.pMetricName = raw_name; + metricRequest.isolated = true; + metricRequest.keepInstances = true; + raw_requests.push_back(metricRequest); + } + + return raw_requests; +} + +class metrics_config +{ + bool initialized{}; + + void create(const std::string &chip_name, + 
const std::uint8_t *availability_image) + { + NVPW_CUDA_RawMetricsConfig_Create_V2_Params params = { + NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE}; + + params.activityKind = NVPA_ACTIVITY_KIND_PROFILER; + params.pChipName = chip_name.c_str(); + params.pCounterAvailabilityImage = availability_image; + + nvpw_call(NVPW_CUDA_RawMetricsConfig_Create_V2(¶ms)); + + raw_metrics_config = params.pRawMetricsConfig; + initialized = true; + } + + void set_availability_image(const std::uint8_t *availability_image) + { + NVPW_RawMetricsConfig_SetCounterAvailability_Params params = { + NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE}; + + params.pRawMetricsConfig = raw_metrics_config; + params.pCounterAvailabilityImage = availability_image; + + nvpw_call(NVPW_RawMetricsConfig_SetCounterAvailability(¶ms)); + } + + void begin_config_group() + { + NVPW_RawMetricsConfig_BeginPassGroup_Params params = { + NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE}; + + params.pRawMetricsConfig = raw_metrics_config; + + nvpw_call(NVPW_RawMetricsConfig_BeginPassGroup(¶ms)); + } + + void add_metrics(const std::vector &raw_metric_requests) + { + NVPW_RawMetricsConfig_AddMetrics_Params params = { + NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE}; + + params.pRawMetricsConfig = raw_metrics_config; + params.pRawMetricRequests = raw_metric_requests.data(); + params.numMetricRequests = raw_metric_requests.size(); + + nvpw_call(NVPW_RawMetricsConfig_AddMetrics(¶ms)); + } + + void end_config_group() + { + NVPW_RawMetricsConfig_EndPassGroup_Params params = { + NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE}; + + params.pRawMetricsConfig = raw_metrics_config; + + nvpw_call(NVPW_RawMetricsConfig_EndPassGroup(¶ms)); + } + + void generate() + { + NVPW_RawMetricsConfig_GenerateConfigImage_Params params = { + NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE}; + + params.pRawMetricsConfig = raw_metrics_config; + + 
nvpw_call(NVPW_RawMetricsConfig_GenerateConfigImage(¶ms)); + } + +public: + metrics_config(const std::string &chip_name, + const std::vector &raw_metric_requests, + const std::uint8_t *availability_image) + { + create(chip_name, availability_image); + set_availability_image(availability_image); + + begin_config_group(); + add_metrics(raw_metric_requests); + end_config_group(); + generate(); + } + + [[nodiscard]] std::vector get_config_image() + { + NVPW_RawMetricsConfig_GetConfigImage_Params params = { + NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE}; + + params.pRawMetricsConfig = raw_metrics_config; + params.bytesAllocated = 0; + params.pBuffer = nullptr; + + nvpw_call(NVPW_RawMetricsConfig_GetConfigImage(¶ms)); + + std::vector config_image(params.bytesCopied); + params.bytesAllocated = config_image.size(); + params.pBuffer = config_image.data(); + + nvpw_call(NVPW_RawMetricsConfig_GetConfigImage(¶ms)); + return config_image; + } + + ~metrics_config() + { + if (initialized) + { + NVPW_RawMetricsConfig_Destroy_Params params = { + NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE}; + + params.pRawMetricsConfig = raw_metrics_config; + + NVPW_RawMetricsConfig_Destroy(¶ms); + } + } + + NVPA_RawMetricsConfig *raw_metrics_config; +}; + +} // namespace + +void cupti_profiler::initialize_config_image() +{ + m_config_image = + metrics_config(m_chip_name, + get_raw_metric_requests(m_chip_name, + m_metric_names, + m_availability_image.data()), + m_availability_image.data()) + .get_config_image(); +} + +namespace +{ + +class counter_data_builder +{ + bool initialized{}; + +public: + counter_data_builder(const std::string &chip_name, + const std::uint8_t *pCounterAvailabilityImage) + { + NVPW_CUDA_CounterDataBuilder_Create_Params params = { + NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE}; + + params.pChipName = chip_name.c_str(); + params.pCounterAvailabilityImage = pCounterAvailabilityImage; + + nvpw_call(NVPW_CUDA_CounterDataBuilder_Create(¶ms)); + + 
builder = params.pCounterDataBuilder; + initialized = true; + } + + ~counter_data_builder() + { + if (initialized) + { + NVPW_CounterDataBuilder_Destroy_Params params = { + NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE}; + + params.pCounterDataBuilder = builder; + + NVPW_CounterDataBuilder_Destroy(¶ms); + } + } + + NVPA_CounterDataBuilder *builder; +}; + +} // namespace + +void cupti_profiler::initialize_counter_data_prefix_image() +{ + const std::uint8_t *counter_availability_image = nullptr; + + std::vector raw_metric_requests = + get_raw_metric_requests(m_chip_name, + m_metric_names, + counter_availability_image); + + counter_data_builder data_builder(m_chip_name, counter_availability_image); + + { + NVPW_CounterDataBuilder_AddMetrics_Params params = { + NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE}; + + params.pCounterDataBuilder = data_builder.builder; + params.pRawMetricRequests = raw_metric_requests.data(); + params.numMetricRequests = raw_metric_requests.size(); + + nvpw_call(NVPW_CounterDataBuilder_AddMetrics(¶ms)); + } + + { + NVPW_CounterDataBuilder_GetCounterDataPrefix_Params params = { + NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE}; + + params.pCounterDataBuilder = data_builder.builder; + params.bytesAllocated = 0; + params.pBuffer = nullptr; + + nvpw_call(NVPW_CounterDataBuilder_GetCounterDataPrefix(¶ms)); + + m_data_image_prefix.resize(params.bytesCopied); + params.bytesAllocated = m_data_image_prefix.size(); + params.pBuffer = m_data_image_prefix.data(); + + nvpw_call(NVPW_CounterDataBuilder_GetCounterDataPrefix(¶ms)); + } +} + +namespace +{ + +[[nodiscard]] std::size_t +get_counter_data_image_size(CUpti_Profiler_CounterDataImageOptions *options) +{ + CUpti_Profiler_CounterDataImage_CalculateSize_Params params = { + CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE}; + + params.pOptions = options; + params.sizeofCounterDataImageOptions = + CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; + + 
cupti_call(cuptiProfilerCounterDataImageCalculateSize(¶ms)); + return params.counterDataImageSize; +} + +} // namespace + +void cupti_profiler::initialize_counter_data_image() +{ + CUpti_Profiler_CounterDataImageOptions counter_data_image_options; + + counter_data_image_options.pCounterDataPrefix = &m_data_image_prefix[0]; + counter_data_image_options.counterDataPrefixSize = m_data_image_prefix.size(); + counter_data_image_options.maxNumRanges = m_num_ranges; + counter_data_image_options.maxNumRangeTreeNodes = m_num_ranges; + counter_data_image_options.maxRangeNameLength = 64; + + m_data_image.resize(get_counter_data_image_size(&counter_data_image_options)); + + { + CUpti_Profiler_CounterDataImage_Initialize_Params params = { + CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE}; + + params.sizeofCounterDataImageOptions = + CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; + params.pOptions = &counter_data_image_options; + params.counterDataImageSize = m_data_image.size(); + + params.pCounterDataImage = &m_data_image[0]; + cupti_call(cuptiProfilerCounterDataImageInitialize(¶ms)); + } + + { + CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params params = { + CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE}; + + params.counterDataImageSize = m_data_image.size(); + params.pCounterDataImage = &m_data_image[0]; + + cupti_call( + cuptiProfilerCounterDataImageCalculateScratchBufferSize(¶ms)); + + m_data_scratch_buffer.resize(params.counterDataScratchBufferSize); + } + + { + CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params params = { + CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE}; + + params.counterDataImageSize = m_data_image.size(); + params.pCounterDataImage = &m_data_image[0]; + params.counterDataScratchBufferSize = m_data_scratch_buffer.size(); + params.pCounterDataScratchBuffer = &m_data_scratch_buffer[0]; + + 
cupti_call(cuptiProfilerCounterDataImageInitializeScratchBuffer(&params)); + } +} + +cupti_profiler::~cupti_profiler() +{ + if (is_initialized()) + { + CUpti_Profiler_DeInitialize_Params params = { + CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE}; + + cuptiProfilerDeInitialize(&params); + } +} + +bool cupti_profiler::is_initialized() const +{ + return m_available; +} + +void cupti_profiler::prepare_user_loop() +{ + { + CUpti_Profiler_BeginSession_Params params = { + CUpti_Profiler_BeginSession_Params_STRUCT_SIZE}; + + params.ctx = nullptr; + params.counterDataImageSize = m_data_image.size(); + params.pCounterDataImage = &m_data_image[0]; + params.counterDataScratchBufferSize = m_data_scratch_buffer.size(); + params.pCounterDataScratchBuffer = &m_data_scratch_buffer[0]; + + // Each kernel is going to produce its own set of metrics + params.range = CUPTI_UserRange; + params.replayMode = CUPTI_UserReplay; + params.maxRangesPerPass = m_num_ranges; + params.maxLaunchesPerPass = m_num_ranges; + + cupti_call(cuptiProfilerBeginSession(&params)); + } + + { + CUpti_Profiler_SetConfig_Params params = { + CUpti_Profiler_SetConfig_Params_STRUCT_SIZE}; + + params.pConfig = &m_config_image[0]; + params.configSize = m_config_image.size(); + params.minNestingLevel = 1; + params.numNestingLevels = 1; + params.passIndex = 0; + + cupti_call(cuptiProfilerSetConfig(&params)); + } +} + +void cupti_profiler::start_user_loop() +{ + { + CUpti_Profiler_BeginPass_Params params = { + CUpti_Profiler_BeginPass_Params_STRUCT_SIZE}; + + cupti_call(cuptiProfilerBeginPass(&params)); + } + + { + CUpti_Profiler_EnableProfiling_Params params = { + CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE}; + + cupti_call(cuptiProfilerEnableProfiling(&params)); + } + + { + CUpti_Profiler_PushRange_Params params = { + CUpti_Profiler_PushRange_Params_STRUCT_SIZE}; + + std::string rangeName = "nvbench"; + params.pRangeName = rangeName.c_str(); + + cupti_call(cuptiProfilerPushRange(&params)); + } +} + +void cupti_profiler::stop_user_loop() 
+{ + { + CUpti_Profiler_PopRange_Params params = { + CUpti_Profiler_PopRange_Params_STRUCT_SIZE}; + + cupti_call(cuptiProfilerPopRange(&params)); + } + + { + CUpti_Profiler_DisableProfiling_Params params = { + CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE}; + + cupti_call(cuptiProfilerDisableProfiling(&params)); + } +} + +bool cupti_profiler::is_replay_required() +{ + CUpti_Profiler_EndPass_Params params = { + CUpti_Profiler_EndPass_Params_STRUCT_SIZE}; + + cupti_call(cuptiProfilerEndPass(&params)); + + return !params.allPassesSubmitted; +} + +void cupti_profiler::process_user_loop() +{ + { + CUpti_Profiler_FlushCounterData_Params params = { + CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE}; + + cupti_call(cuptiProfilerFlushCounterData(&params)); + } + + { + CUpti_Profiler_UnsetConfig_Params params = { + CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE}; + + cupti_call(cuptiProfilerUnsetConfig(&params)); + } + + { + CUpti_Profiler_EndSession_Params params = { + CUpti_Profiler_EndSession_Params_STRUCT_SIZE}; + + cupti_call(cuptiProfilerEndSession(&params)); + } +} + +std::vector<double> cupti_profiler::get_counter_values() +{ + metric_evaluator evaluator(m_chip_name, + m_availability_image.data(), + m_data_image.data(), + m_data_image.size()); + + { + NVPW_CounterData_GetNumRanges_Params params = { + NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE}; + + params.pCounterDataImage = m_data_image.data(); + nvpw_call(NVPW_CounterData_GetNumRanges(&params)); + + if (params.numRanges != 1) + { + throw std::runtime_error("Something's gone wrong, one range is expected"); + } + } + + std::size_t range_id{}; // there's only one range + std::size_t result_id{}; + std::vector<double> result(m_metric_names.size()); + + for (const std::string &metric_name : m_metric_names) + { + eval_request request = evaluator.create_request(metric_name); + + { + NVPW_MetricsEvaluator_SetDeviceAttributes_Params params = { + NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE}; + + params.pMetricsEvaluator = evaluator; + 
params.pCounterDataImage = m_data_image.data(); + params.counterDataImageSize = m_data_image.size(); + + nvpw_call(NVPW_MetricsEvaluator_SetDeviceAttributes(&params)); + } + + { + NVPW_MetricsEvaluator_EvaluateToGpuValues_Params params = { + NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE}; + + params.pMetricsEvaluator = evaluator; + params.pMetricEvalRequests = &request.request; + params.numMetricEvalRequests = 1; + params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE; + params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest); + params.pCounterDataImage = m_data_image.data(); + params.counterDataImageSize = m_data_image.size(); + params.rangeIndex = range_id; + params.isolated = true; + params.pMetricValues = &result[result_id++]; + + nvpw_call(NVPW_MetricsEvaluator_EvaluateToGpuValues(&params)); + } + } + + return result; +} + +} // namespace nvbench::detail diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index f80c2d2..25bcd09 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -42,7 +42,7 @@ measure_cold_base::measure_cold_base(state &exec_state) , m_min_time{exec_state.get_min_time()} , m_skip_time{exec_state.get_skip_time()} , m_timeout{exec_state.get_timeout()} -{} +{ } void measure_cold_base::check() { diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index b1ea953..556d043 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -185,6 +185,7 @@ private: } kernel_launch_timer timer(*this); + this->launch_kernel(timer); this->check_skip_time(m_cuda_timer.get_duration()); } diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu new file mode 100644 index 0000000..e2952fd --- /dev/null +++ b/nvbench/detail/measure_cupti.cu @@ -0,0 +1,263 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you 
may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace nvbench::detail +{ + +namespace +{ + +enum class metric_id : int +{ + dram_peak_sustained_throughput = 0, + global_load_efficiency, + global_store_efficiency, + l1_hit_rate, + l2_hit_rate, + + count +}; + +template <metric_id id> +struct metric_traits; + +template <> +struct metric_traits<metric_id::dram_peak_sustained_throughput> +{ + static constexpr const char *metric_name = + "dram__throughput.avg.pct_of_peak_sustained_elapsed"; + + static constexpr const char *summary = + "Peak Sustained Global Memory Throughput (HW)"; + + static constexpr const char *hint = "percentage"; + static constexpr const char *short_name = "HBWPeak"; + + static constexpr const char *description = + "The utilization level of the device memory relative to the peak " + "utilization."; + + static constexpr double divider = 100.0; + + static bool is_collected(nvbench::state &m_state) + { + return m_state.is_dram_throughput_collected(); + }; +}; + +template <> +struct metric_traits<metric_id::global_load_efficiency> +{ + static constexpr const char *metric_name = + "smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct"; + + static constexpr const char *summary = "Global Load Efficiency (HW)"; + static constexpr const char *hint = "percentage"; + static constexpr const char *short_name = "LoadEff"; + + static constexpr const char *description = + "Ratio of requested global memory load throughput to required global " + "memory load throughput expressed as percentage."; + + static 
constexpr double divider = 100.0; + + static bool is_collected(nvbench::state &m_state) + { + return m_state.is_loads_efficiency_collected(); + }; +}; + +template <> +struct metric_traits<metric_id::global_store_efficiency> +{ + static constexpr const char *metric_name = + "smsp__sass_average_data_bytes_per_sector_mem_global_op_st.pct"; + + static constexpr const char *summary = "Global Store Efficiency (HW)"; + static constexpr const char *hint = "percentage"; + static constexpr const char *short_name = "StoreEff"; + + static constexpr const char *description = + "Ratio of requested global memory store throughput to required global " + "memory store throughput expressed as percentage."; + + static constexpr double divider = 100.0; + + static bool is_collected(nvbench::state &m_state) + { + return m_state.is_stores_efficiency_collected(); + }; +}; + +template <> +struct metric_traits<metric_id::l1_hit_rate> +{ + static constexpr const char *metric_name = "l1tex__t_sector_hit_rate.pct"; + static constexpr const char *summary = "L1 Cache Hit Rate (HW)"; + static constexpr const char *hint = "percentage"; + static constexpr const char *short_name = "L1HitRate"; + static constexpr const char *description = "Hit rate at L1 cache."; + static constexpr double divider = 100.0; + + static bool is_collected(nvbench::state &m_state) + { + return m_state.is_l1_hit_rate_collected(); + }; +}; + +template <> +struct metric_traits<metric_id::l2_hit_rate> +{ + static constexpr const char *metric_name = "lts__t_sector_hit_rate.pct"; + static constexpr const char *summary = "L2 Cache Hit Rate (HW)"; + static constexpr const char *hint = "percentage"; + static constexpr const char *short_name = "L2HitRate"; + static constexpr const char *description = "Hit rate at L2 cache."; + static constexpr double divider = 100.0; + + static bool is_collected(nvbench::state &m_state) + { + return m_state.is_l2_hit_rate_collected(); + }; +}; + +template <metric_id id> +void add_metrics_impl(nvbench::state &state, std::vector<std::string> &metrics) +{ + if (metric_traits<id>::is_collected(state)) + { + 
metrics.emplace_back(metric_traits<id>::metric_name); + } + + constexpr auto next_id = static_cast<metric_id>(static_cast<int>(id) + 1); + add_metrics_impl<next_id>(state, metrics); +} + +template <> +void add_metrics_impl<metric_id::count>(nvbench::state &, + std::vector<std::string> &) +{} + +std::vector<std::string> add_metrics(nvbench::state &state) +{ + std::vector<std::string> metrics; + metrics.reserve(static_cast<std::size_t>(metric_id::count)); + + add_metrics_impl<metric_id::dram_peak_sustained_throughput>(state, metrics); + return metrics; +} + +} // namespace + +measure_cupti_base::measure_cupti_base(state &exec_state) +try : m_state{exec_state}, m_cupti(*m_state.get_device(), add_metrics(m_state)) +{} +catch (const std::exception &ex) +{ + if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); + printer_opt_ref) + { + auto &printer = printer_opt_ref.value().get(); + printer.log(nvbench::log_level::warn, + fmt::format("CUPTI failed to construct profiler: {}", + ex.what())); + } +} + +void measure_cupti_base::check() +{ + const auto device = m_state.get_device(); + if (!device) + { + NVBENCH_THROW(std::runtime_error, + "{}", + "Device required for `cupti` measurement."); + } + if (!device->is_active()) + { // This means something went wrong higher up. Throw an error. 
+ NVBENCH_THROW(std::runtime_error, + "{}", + "Internal error: Current device is not active."); + } +} + +namespace +{ + +template <metric_id id> +void gen_summary(std::size_t result_id, + nvbench::state &m_state, + const std::vector<double> &result) +{ + using metric = metric_traits<id>; + + if (metric::is_collected(m_state)) + { + auto &summ = m_state.add_summary(metric::summary); + summ.set_string("hint", metric::hint); + summ.set_string("short_name", metric::short_name); + summ.set_string("description", metric::description); + summ.set_float64("value", result[result_id++] / metric::divider); + } + + constexpr auto next_id = static_cast<metric_id>(static_cast<int>(id) + 1); + gen_summary<next_id>(result_id, m_state, result); +} + +template <> +void gen_summary<metric_id::count>(std::size_t, + nvbench::state &, + const std::vector<double> &) +{} + +void gen_summaries(nvbench::state &state, const std::vector<double> &result) +{ + gen_summary<metric_id::dram_peak_sustained_throughput>(0, state, result); +} + +} // namespace + +void measure_cupti_base::generate_summaries() +try +{ + gen_summaries(m_state, m_cupti.get_counter_values()); +} +catch (const std::exception &ex) +{ + if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); + printer_opt_ref) + { + auto &printer = printer_opt_ref.value().get(); + printer.log(nvbench::log_level::warn, + fmt::format("CUPTI failed to generate the summary: {}", + ex.what())); + } +} + +} // namespace nvbench::detail diff --git a/nvbench/detail/measure_cupti.cuh b/nvbench/detail/measure_cupti.cuh new file mode 100644 index 0000000..650190a --- /dev/null +++ b/nvbench/detail/measure_cupti.cuh @@ -0,0 +1,148 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. 
+ * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace nvbench +{ + +struct state; + +namespace detail +{ + +// non-templated code goes here: +struct measure_cupti_base +{ + explicit measure_cupti_base(nvbench::state &exec_state); + measure_cupti_base(const measure_cupti_base &) = delete; + measure_cupti_base(measure_cupti_base &&) = delete; + measure_cupti_base &operator=(const measure_cupti_base &) = delete; + measure_cupti_base &operator=(measure_cupti_base &&) = delete; + +protected: + struct kernel_launch_timer; + + void check(); + void generate_summaries(); + + __forceinline__ void flush_device_l2() + { + m_l2flush.flush(m_launch.get_stream()); + } + + __forceinline__ void sync_stream() const + { + NVBENCH_CUDA_CALL(cudaStreamSynchronize(m_launch.get_stream())); + } + + nvbench::state &m_state; + + nvbench::launch m_launch; + nvbench::detail::l2flush m_l2flush; + + cupti_profiler m_cupti; +}; + +struct measure_cupti_base::kernel_launch_timer +{ + explicit kernel_launch_timer(measure_cupti_base &measure) + : m_measure{measure} + {} + + __forceinline__ void start() + { + m_measure.flush_device_l2(); + m_measure.sync_stream(); + + if (m_measure.m_cupti.is_initialized()) + { + m_measure.m_cupti.start_user_loop(); + } + } + + __forceinline__ void stop() + { + if (m_measure.m_cupti.is_initialized()) + { + m_measure.m_cupti.stop_user_loop(); + } + + m_measure.sync_stream(); + } + +private: + 
measure_cupti_base &m_measure; +}; + +template +struct measure_cupti : public measure_cupti_base +{ + measure_cupti(nvbench::state &state, KernelLauncher &kernel_launcher) + : measure_cupti_base(state) + , m_kernel_launcher{kernel_launcher} + {} + + void operator()() + { + this->check(); + this->run(); + this->generate_summaries(); + } + +private: + // Run the kernel as many times as CUPTI requires. + void run() + { + kernel_launch_timer timer(*this); + + m_cupti.prepare_user_loop(); + + do + { + m_kernel_launcher(m_launch, timer); + } while (m_cupti.is_replay_required()); + + m_cupti.process_user_loop(); + } + + KernelLauncher &m_kernel_launcher; +}; + +} // namespace detail +} // namespace nvbench diff --git a/nvbench/detail/state_exec.cuh b/nvbench/detail/state_exec.cuh index 7e6f209..edde96d 100644 --- a/nvbench/detail/state_exec.cuh +++ b/nvbench/detail/state_exec.cuh @@ -23,10 +23,14 @@ "Do not include it directly." #endif // NVBENCH_STATE_EXEC_GUARD +#include #include #include #include +#ifdef NVBENCH_HAS_CUPTI +#include +#endif // NVBENCH_HAS_CUPTI #include #include @@ -83,6 +87,19 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher) constexpr bool use_blocking_kernel = !(tags & no_block); if constexpr (tags & timer) { + // Estimate bandwidth here + #ifdef NVBENCH_HAS_CUPTI + if constexpr (!(modifier_tags & run_once)) + { + if (this->is_cupti_required()) + { + using measure_t = nvbench::detail::measure_cupti; + measure_t measure{*this, kernel_launcher}; + measure(); + } + } + #endif + using measure_t = nvbench::detail::measure_cold; measure_t measure{*this, kernel_launcher}; measure(); @@ -90,9 +107,23 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher) else { // Need to wrap the kernel launcher with a timer wrapper: using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper; + wrapper_t wrapper{kernel_launcher}; + + // Estimate bandwidth here + #ifdef NVBENCH_HAS_CUPTI + if constexpr (!(modifier_tags & run_once)) + { + 
if (this->is_cupti_required()) + { + using measure_t = nvbench::detail::measure_cupti; + measure_t measure{*this, wrapper}; + measure(); + } + } + #endif + using measure_t = nvbench::detail::measure_cold; - wrapper_t wrapper{kernel_launcher}; measure_t measure(*this, wrapper); measure(); } diff --git a/nvbench/device_info.cu b/nvbench/device_info.cu index 1550604..272b783 100644 --- a/nvbench/device_info.cu +++ b/nvbench/device_info.cu @@ -144,4 +144,20 @@ catch (nvml::call_failed &e) } #endif // NVBENCH_HAS_NVML +#ifdef NVBENCH_HAS_CUPTI +[[nodiscard]] CUcontext device_info::get_context() const +{ + if (!is_active()) + { + NVBENCH_THROW(std::runtime_error, + "{}", + "get_context is called for inactive device"); + } + + CUcontext cu_context; + NVBENCH_DRIVER_API_CALL(cuCtxGetCurrent(&cu_context)); + return cu_context; +} +#endif + } // namespace nvbench diff --git a/nvbench/device_info.cuh b/nvbench/device_info.cuh index c960138..296a2c2 100644 --- a/nvbench/device_info.cuh +++ b/nvbench/device_info.cuh @@ -18,12 +18,14 @@ #pragma once +#include #include #include #include #include // CHAR_BIT +#include #include #include @@ -67,6 +69,13 @@ struct device_info void set_active() const { NVBENCH_CUDA_CALL(cudaSetDevice(m_id)); + +#ifdef NVBENCH_HAS_CUPTI + // cudaSetDevice doesn't initialize a context on the first call, so we have + // to force it. According to the documentation, if devPtr is 0, no + // operation is performed. + NVBENCH_CUDA_CALL(cudaFree(nullptr)); +#endif } /// Enable or disable persistence mode. @@ -200,6 +209,20 @@ struct device_info /// @return True if ECC is enabled on this device. [[nodiscard]] bool get_ecc_state() const { return m_prop.ECCEnabled; } + /// @return True if CUPTI supports this device. 
+ [[nodiscard]] bool is_cupti_supported() const + { +#ifdef NVBENCH_HAS_CUPTI + return m_prop.major >= 7; +#else + return false; +#endif + } + +#ifdef NVBENCH_HAS_CUPTI + [[nodiscard]] CUcontext get_context() const; +#endif + /// @return A cached copy of the device's cudaDeviceProp. [[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const { diff --git a/nvbench/main.cuh b/nvbench/main.cuh index 83a8672..a16128c 100644 --- a/nvbench/main.cuh +++ b/nvbench/main.cuh @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -45,9 +46,16 @@ return 1; \ } +#ifdef NVBENCH_HAS_CUPTI +#define NVBENCH_INITIALIZE_DRIVER_API NVBENCH_DRIVER_API_CALL(cuInit(0)); +#else +#define NVBENCH_INITIALIZE_DRIVER_API +#endif + #define NVBENCH_MAIN_BODY(argc, argv) \ do \ { \ + NVBENCH_INITIALIZE_DRIVER_API \ nvbench::option_parser parser; \ parser.parse(argc, argv); \ auto &printer = parser.get_printer(); \ diff --git a/nvbench/state.cuh b/nvbench/state.cuh index ebabab6..8cf3e9c 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -221,6 +221,51 @@ struct state return m_benchmark; } + void collect_l1_hit_rates() { m_collect_l1_hit_rates = true; } + void collect_l2_hit_rates() { m_collect_l2_hit_rates = true; } + void collect_stores_efficiency() { m_collect_stores_efficiency = true; } + void collect_loads_efficiency() { m_collect_loads_efficiency = true; } + void collect_dram_throughput() { m_collect_dram_throughput = true; } + + void collect_cupti_metrics() + { + collect_l1_hit_rates(); + collect_l2_hit_rates(); + collect_stores_efficiency(); + collect_loads_efficiency(); + collect_dram_throughput(); + } + + [[nodiscard]] bool is_l1_hit_rate_collected() const + { + return m_collect_l1_hit_rates; + } + [[nodiscard]] bool is_l2_hit_rate_collected() const + { + return m_collect_l2_hit_rates; + } + [[nodiscard]] bool is_stores_efficiency_collected() const + { + return m_collect_stores_efficiency; + } + [[nodiscard]] bool is_loads_efficiency_collected() const + 
{ + return m_collect_loads_efficiency; + } + [[nodiscard]] bool is_dram_throughput_collected() const + { + return m_collect_dram_throughput; + } + + [[nodiscard]] bool is_cupti_required() const + { + return is_l2_hit_rate_collected() + || is_l1_hit_rate_collected() + || is_stores_efficiency_collected() + || is_loads_efficiency_collected() + || is_dram_throughput_collected(); + } + summary &add_summary(std::string summary_name); summary &add_summary(summary s); [[nodiscard]] const summary &get_summary(std::string_view name) const; @@ -279,6 +324,12 @@ private: std::string m_skip_reason; std::size_t m_element_count{}; std::size_t m_global_memory_rw_bytes{}; + + bool m_collect_l1_hit_rates{}; + bool m_collect_l2_hit_rates{}; + bool m_collect_stores_efficiency{}; + bool m_collect_loads_efficiency{}; + bool m_collect_dram_throughput{}; }; } // namespace nvbench diff --git a/testing/cmake/test_export/CMakeLists.txt b/testing/cmake/test_export/CMakeLists.txt index 20d8b99..12442fe 100644 --- a/testing/cmake/test_export/CMakeLists.txt +++ b/testing/cmake/test_export/CMakeLists.txt @@ -7,4 +7,12 @@ find_package(NVBench) add_executable(test_bench test_bench.cu) target_link_libraries(test_bench PRIVATE nvbench::main) enable_testing() -add_test(NAME test COMMAND "$" --timeout 1) +add_test(NAME test_bench COMMAND "$" --timeout 1) + +# Need to add the CUPTI path to LD_LIBRARY_PATH to make sure CUPTI libraries +# are found at runtime: +if (UNIX AND TARGET nvbench::cupti) + get_property(cupti_lib_path TARGET nvbench::cupti PROPERTY IMPORTED_LOCATION) + cmake_path(GET cupti_lib_path PARENT_PATH cupti_lib_path) + set_property(TEST test_bench PROPERTY ENVIRONMENT "LD_LIBRARY_PATH=${cupti_lib_path}") +endif()