mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-04-20 06:48:53 +00:00
235 lines
7.2 KiB
Plaintext
235 lines
7.2 KiB
Plaintext
/*
|
|
* Copyright 2021 NVIDIA Corporation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
|
* (the "License"); you may not use this file except in compliance with
|
|
* the License.
|
|
*
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <nvbench/config.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/detail/device_scope.cuh>

#include <cuda_runtime_api.h>

#include <climits> // CHAR_BIT
#include <cstddef> // std::size_t
#include <cstdint>
#include <stdexcept>
#include <string_view>
#include <utility>
|
|
|
|
// forward declare this for internal storage
|
|
struct nvmlDevice_st;
|
|
|
|
namespace nvbench
|
|
{
|
|
|
|
namespace detail
{
// Declared ahead of device_info so that device_info::get_ptx_version() can
// call it; the inline definition appears further down in this header.
int get_ptx_version(int dev_id);
} // namespace detail
|
|
|
|
/// Describes one CUDA device: stores its id together with a cached copy of
/// its `cudaDeviceProp` and exposes accessors for commonly used properties.
struct device_info
{
  /// Construct the description of the device with the given id.
  /// Defined out-of-line.
  explicit device_info(int device_id);

  // Mainly used by unit tests:
  /// Construct from an id and an already populated `cudaDeviceProp`.
  /// @note Only the id and the properties are supplied through this overload;
  /// the NVML handle and the cached clock rates keep their default values
  /// (null / 0).
  device_info(int device_id, cudaDeviceProp prop)
      : m_id{device_id}
      , m_prop{prop}
  {}

  /// @return The device's id on the current system.
  [[nodiscard]] int get_id() const { return m_id; }

  /// @return The name of the device.
  [[nodiscard]] std::string_view get_name() const { return std::string_view(m_prop.name); }

  /// @return True if `cudaGetDevice` reports this device's id.
  [[nodiscard]] bool is_active() const
  {
    int id{-1};
    NVBENCH_CUDA_CALL(cudaGetDevice(&id));
    return id == m_id;
  }

  /// Make this device the current one via `cudaSetDevice`.
  void set_active() const
  {
    NVBENCH_CUDA_CALL(cudaSetDevice(m_id));

#ifdef NVBENCH_HAS_CUPTI
    // cudaSetDevice doesn't initialize a context on the first call, so we have
    // to force it. According to the documentation, if devPtr is 0, no
    // operation is performed.
    NVBENCH_CUDA_CALL(cudaFree(nullptr));
#endif
  }

  /// Enable or disable persistence mode.
  /// @note Only supported on Linux.
  /// @note Requires root / admin privileges.
  void set_persistence_mode(bool state);

  /// Symbolic values for special clock rates
  enum class clock_rate
  {
    /// Unlock clocks
    none,
    /// Base TDP clock; Preferred for stable benchmarking
    base,
    /// Maximum clock rate
    maximum
  };

  /// Lock GPU clocks to the specified rate.
  /// @note Only supported on Volta+ (sm_70+) devices.
  /// @note Requires root / admin privileges.
  void lock_gpu_clocks(clock_rate rate);

  /// @return The SM version of the current device as (major*100) + (minor*10).
  [[nodiscard]] int get_sm_version() const { return m_prop.major * 100 + m_prop.minor * 10; }

  /// @return The PTX version of the current device, e.g. sm_80 returns 800.
  /// Forwards to detail::get_ptx_version, which yields -1 when the query fails.
  [[nodiscard]] __forceinline__ int get_ptx_version() const
  {
    return detail::get_ptx_version(m_id);
  }

  /// @return The default clock rate of the SM in Hz.
  [[nodiscard]] std::size_t get_sm_default_clock_rate() const { return m_sm_default_clock_rate; }

  /// @return The number of physical streaming multiprocessors on this device.
  [[nodiscard]] int get_number_of_sms() const { return m_prop.multiProcessorCount; }

  /// @return The maximum number of resident blocks per SM.
  [[nodiscard]] int get_max_blocks_per_sm() const { return m_prop.maxBlocksPerMultiProcessor; }

  /// @return The maximum number of resident threads per SM.
  [[nodiscard]] int get_max_threads_per_sm() const { return m_prop.maxThreadsPerMultiProcessor; }

  /// @return The maximum number of threads per block.
  [[nodiscard]] int get_max_threads_per_block() const { return m_prop.maxThreadsPerBlock; }

  /// @return The number of registers per SM.
  [[nodiscard]] int get_registers_per_sm() const { return m_prop.regsPerMultiprocessor; }

  /// @return The number of registers per block.
  [[nodiscard]] int get_registers_per_block() const { return m_prop.regsPerBlock; }

  /// @return The total number of bytes available in global memory.
  [[nodiscard]] std::size_t get_global_memory_size() const { return m_prop.totalGlobalMem; }

  /// Free / total byte counts returned by get_global_memory_usage().
  struct memory_info
  {
    std::size_t bytes_free;
    std::size_t bytes_total;
  };

  /// @return The size and usage of this device's global memory.
  [[nodiscard]] memory_info get_global_memory_usage() const;

  /// @return The peak clock rate of the global memory bus in Hz.
  [[nodiscard]] std::size_t get_global_memory_bus_peak_clock_rate() const
  {
    return m_global_memory_bus_peak_clock_rate;
  }

  /// @return The width of the global memory bus in bits.
  [[nodiscard]] int get_global_memory_bus_width() const { return m_prop.memoryBusWidth; }

  /// @return The global memory bus bandwidth in bytes/sec.
  [[nodiscard]] std::size_t get_global_memory_bus_bandwidth() const
  { // 2 is for DDR, CHAR_BIT to convert bus_width to bytes.
    return 2 * this->get_global_memory_bus_peak_clock_rate() *
           static_cast<std::size_t>(this->get_global_memory_bus_width() / CHAR_BIT);
  }

  /// @return The size of the L2 cache in bytes.
  [[nodiscard]] std::size_t get_l2_cache_size() const
  {
    return static_cast<std::size_t>(m_prop.l2CacheSize);
  }

  /// @return The available amount of shared memory in bytes per SM.
  [[nodiscard]] std::size_t get_shared_memory_per_sm() const
  {
    return m_prop.sharedMemPerMultiprocessor;
  }

  /// @return The available amount of shared memory in bytes per block.
  [[nodiscard]] std::size_t get_shared_memory_per_block() const { return m_prop.sharedMemPerBlock; }

  /// @return True if ECC is enabled on this device.
  [[nodiscard]] bool get_ecc_state() const { return m_prop.ECCEnabled; }

  /// @return True if CUPTI supports this device.
  /// Always false when NVBENCH_HAS_CUPTI is not defined; otherwise true for
  /// devices whose major compute capability is at least 7.
  [[nodiscard]] bool is_cupti_supported() const
  {
#ifdef NVBENCH_HAS_CUPTI
    return m_prop.major >= 7;
#else
    return false;
#endif
  }

#ifdef NVBENCH_HAS_CUPTI
  /// @return A CUcontext for this device. Defined out-of-line.
  [[nodiscard]] CUcontext get_context() const;
#endif

  /// @return A cached copy of the device's cudaDeviceProp.
  [[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const { return m_prop; }

  // Ordering and equality consider the device id only.
  [[nodiscard]] bool operator<(const device_info &o) const { return m_id < o.m_id; }
  [[nodiscard]] bool operator==(const device_info &o) const { return m_id == o.m_id; }
  [[nodiscard]] bool operator!=(const device_info &o) const { return m_id != o.m_id; }

private:
  int m_id;
  cudaDeviceProp m_prop;

  // The members below carry default initializers so that objects created
  // through the (id, prop) constructor never expose indeterminate values
  // through the corresponding getters.
  nvmlDevice_st *m_nvml_device{nullptr};

  std::size_t m_sm_default_clock_rate{0};
  std::size_t m_global_memory_bus_peak_clock_rate{0};
};
|
|
|
|
// get_ptx_version implementation. This needs to stay in the header so that it
// picks up the downstream project's compilation settings.
// TODO: This is fragile and will break when called from any library
// translation unit.
|
|
namespace detail
{
// __global__ functions cannot be marked inline, so the kernel is a template
// instead; this works around ODR issues.
template <typename /* unused */>
__global__ void noop_kernel()
{}

// Address of the no-op kernel instantiation that get_ptx_version() inspects.
inline const auto noop_kernel_ptr = &noop_kernel<void>;

/// Query the PTX version through the attributes of the no-op kernel.
/// @param dev_id Id of the device to query.
/// @return `cudaFuncAttributes::ptxVersion` multiplied by 10, or -1 if any
/// exception is thrown while querying.
[[nodiscard]] inline int get_ptx_version(int dev_id)
{
  try
  {
    // NOTE(review): device_scope presumably makes dev_id the current device
    // for the lifetime of this object -- confirm against its definition.
    nvbench::detail::device_scope active_device{dev_id};

    const void *kernel_address = reinterpret_cast<const void *>(nvbench::detail::noop_kernel_ptr);

    cudaFuncAttributes kernel_attributes{};
    NVBENCH_CUDA_CALL(cudaFuncGetAttributes(&kernel_attributes, kernel_address));

    return kernel_attributes.ptxVersion * 10;
  }
  catch (...)
  {
    // Fail gracefully when no appropriate PTX is found for this device.
    return -1;
  }
}

} // namespace detail
|
|
|
|
} // namespace nvbench
|