Files
composable_kernel/include/ck/library/utility/gpu_verification.hpp
Illia Silin c24e528481 [rocm-libraries] ROCm/rocm-libraries#7760 (commit a61bc76)
[CK] suppress compiler warnings while building pytorch. (#7760)

## Motivation

Recently added compiler flags that are required to suppress false
warnings by latest staging compiler are not recognized by older compiler
versions and are triggering an avalanche of warnings. Previous attempt
to suppress them by using -Wno-unknown-warning-option flag didn't help,
because that flag wasn't recognized either and just added more warnings.
I've verified that current approach by checking the clang version
actually works as intended and makes the warnings go away.

## Technical Details

<!-- Explain the changes along with any relevant GitHub links. -->

## Test Plan

<!-- Explain any relevant testing done to verify this PR. -->

## Test Result

<!-- Briefly summarize test outcomes. -->

## Submission Checklist

- [ ] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
2026-05-27 06:56:58 -07:00

527 lines
19 KiB
C++

// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <iomanip>
#include <iostream>
#include <tuple>
#include <type_traits>
#include <cmath>
#include <array>
#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
#include "ck/utility/type.hpp"
#include "ck/utility/env.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/library/utility/check_err.hpp"
#if __clang_major__ >= 23
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions"
#endif
namespace ck {
namespace profiler {
// Result struct for GPU verification with detailed error reporting
// Provides backward compatibility via operator bool()
struct GpuVerifyResult
{
unsigned long long error_count; // Number of elements that exceeded tolerance
float max_error; // Maximum error value observed
std::size_t total; // Total number of elements compared
bool all_zero; // True if device result is all zeros (likely kernel issue)
// Implicit conversion to bool for backward compatibility
// Allows: if (gpu_verify(...)) { ... }
operator bool() const { return error_count == 0; }
// Calculate error percentage
float error_percentage() const
{
if(total == 0)
return 0.0f;
return static_cast<float>(error_count) / static_cast<float>(total) * 100.0f;
}
// Print error summary to stderr (matches check_err format)
void print_error_summary() const
{
if(error_count > 0)
{
if(all_zero)
{
std::cerr << "WARNING: Device result is all zeros - kernel may not have executed "
"properly!"
<< std::endl;
}
std::cerr << "max err: " << max_error;
std::cerr << ", number of errors: " << error_count;
std::cerr << ", " << std::setprecision(2) << std::fixed << error_percentage()
<< "% wrong values" << std::endl;
}
}
};
// Compute relative tolerance for GPU verification
// Matches the logic of ck::utils::get_relative_threshold but handles all types
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
inline float compute_relative_tolerance(const int number_of_accumulations = 1)
{
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using I8 = int8_t;
using I16 = int16_t;
using I32 = int32_t;
// For integer types, tolerance is 0
if constexpr(std::is_same_v<ComputeDataType, I8> || std::is_same_v<ComputeDataType, I16> ||
std::is_same_v<ComputeDataType, I32> || std::is_same_v<ComputeDataType, int>)
{
return 0.0f;
}
// For types supported by get_relative_threshold, use it
else if constexpr((std::is_same_v<ComputeDataType, F16> ||
std::is_same_v<ComputeDataType, BF16> ||
std::is_same_v<ComputeDataType, F32>) &&
(std::is_same_v<OutDataType, F16> || std::is_same_v<OutDataType, BF16> ||
std::is_same_v<OutDataType, F32>) &&
(std::is_same_v<AccDataType, F16> || std::is_same_v<AccDataType, BF16> ||
std::is_same_v<AccDataType, F32>))
{
return static_cast<float>(
ck::utils::get_relative_threshold<ComputeDataType, OutDataType, AccDataType>(
number_of_accumulations));
}
// For unsupported types (FP8, BF8, etc.), use default tolerances based on output type
else
{
if constexpr(std::is_same_v<OutDataType, F16>)
{
return 1e-3f;
}
else if constexpr(std::is_same_v<OutDataType, BF16>)
{
return 1e-1f;
}
else
{
// For FP8/BF8 and other types, use conservative tolerance
return 1e-1f;
}
}
}
/// @brief Turn an iterator type into an iterator that can be dereferenced.
///
/// In gpu_verify and gpu_reduce_max, it is valid to pass a void pointer and
/// have the function automatically derive the "concrete" pointer type to
/// be used in the kernel. This function does that: depending on whether
/// the `Iterator` is a void pointer or not, it returns either the iterator
/// (assuming that it is already concrete), or returns the pointer casted
/// to the concrete type.
///
/// @tparam T The value type of the pointer, when dereferenced.
/// @tparam Iterator The abstract iterator, can be void* or an actual pointer.
///
/// @param it The iterator to make concrete.
template <typename T, typename Iterator>
__device__ Iterator make_concrete_iterator(Iterator it)
{
return it;
}
template <typename T>
__device__ const T* make_concrete_iterator(const void* it)
{
return reinterpret_cast<const T*>(it);
}
template <typename T>
__device__ const T* make_concrete_iterator(void* it)
{
return reinterpret_cast<const T*>(it);
}
/// @brief Utility to launch persistent kernels.
///
/// This function launches a GPU kernel with a grid size derived from the kernel's
/// occupancy and the total number of multiprocessors on the GPU.
///
/// @tparam Kernel The type of the kernel function.
/// @tparam Args The types of the kernel arguments.
///
/// @param kernel An instance of the kernel function. This should be a __global__ function.
/// @param block_size The kernel's (1D) block size.
/// @param stream The stream to launch the kernel on.
/// @param args The kernel launch arguments.
template <typename Kernel, typename... Args>
void launch_persistent_kernel(Kernel kernel,
int block_size,
hipStream_t stream,
const Args&... args)
{
int occupancy;
hip_check_error(
hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, block_size, 0));
int device;
hip_check_error(hipGetDevice(&device));
int multiprocessors;
hip_check_error(
hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device));
kernel<<<occupancy * multiprocessors, block_size, 0, stream>>>(args...);
hip_check_error(hipGetLastError());
}
/// @brief Simple block reduce kernel.
///
/// This function reduces all `value`s across a block according to `reduce`. This function
/// is a relatively simple implementation as its primary purpose is to be correct and
/// readable: No special cases are done for warp reductions, and the function allocates
/// its own shared memory. The result is broadcasted to all threads.
///
/// @tparam BlockSize The number of threads in a block.
/// @tparam T The value type to reduce over.
/// @tparam F The reduction functor type.
///
/// @param value This thread's value to reduce over.
/// @param reduce The reduction functor, used to combine two values. Should be associative.
template <int BlockSize, typename T, typename F>
__device__ T block_reduce(const T& value, F reduce)
{
__shared__ T workspace[BlockSize];
workspace[threadIdx.x] = value;
__syncthreads();
for(unsigned int s = BlockSize / 2; s >= 1; s >>= 1)
{
if(threadIdx.x < s)
workspace[threadIdx.x] = reduce(workspace[threadIdx.x], workspace[threadIdx.x + s]);
__syncthreads();
}
return workspace[0];
}
// Device-side result structure for kernel output
// Packed into a single struct to minimize device memory allocations
struct GpuVerifyDeviceResult
{
unsigned long long error_count; // Number of errors found
float max_error; // Maximum error value
int all_zero; // 1 = device result is all zeros, 0 = has non-zero values
/// @brief Return the neutral element of a GpuVerifyDeviceResult
///
/// This function returns the "neutral element", the element which does nothing
/// when reduced with another with `reduce_results`. Good to be used as an
/// initial value.
__host__ __device__ static GpuVerifyDeviceResult identity()
{
GpuVerifyDeviceResult result;
result.error_count = 0; // No errors yet
result.max_error = 0.0f; // No error observed
result.all_zero = 1; // Start assuming all zeros (will be cleared if nonzero found)
return result;
}
};
/// @brief Combine two device verify results.
///
/// This function returns the "combined" version of two GpuVerifyDeviceResult values, which
/// adds the total amount of errors, sets the correct max error, and records whether
/// any of the values had any zeros.
__device__ GpuVerifyDeviceResult reduce_results(const GpuVerifyDeviceResult& a,
const GpuVerifyDeviceResult& b)
{
GpuVerifyDeviceResult result;
result.error_count = a.error_count + b.error_count;
result.max_error = std::max(a.max_error, b.max_error);
result.all_zero = a.all_zero & b.all_zero;
return result;
}
/// @brief Compare individual tensor elements.
///
/// This function is what actually does the comparison between two tensor
/// elements. The function returns a tuple of three elements.
/// - The absolute maximum difference.
/// - If the second value is set to false, it indicates either that the elements are not
/// equal according to the thresholds `rtol` and `atol`, or that either value is not
/// finite (NaN/Infinity). If set to true, the values are considered equal.
/// - If the third value is set to true, it indicates that both elements are bitwise
/// equal to zero.
template <typename T>
__device__ std::tuple<float, bool, bool>
compare_elements(const T& actual, const T& expected, const float rtol, const float atol)
{
static_assert(!std::is_same_v<T, double>, "TODO: implement compare_elements() for double");
const auto o = type_convert<float>(actual);
const auto r = type_convert<float>(expected);
const auto e = std::abs(o - r);
const auto inequal = e > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r);
using Bytes = std::array<std::byte, sizeof(T)>;
const auto o_bytes = *reinterpret_cast<const Bytes*>(&actual);
const auto r_bytes = *reinterpret_cast<const Bytes*>(&expected);
bool all_zero = true;
for(const auto x : o_bytes)
{
if(x != std::byte{0})
all_zero = false;
}
for(const auto x : r_bytes)
{
if(x != std::byte{0})
all_zero = false;
}
return std::make_tuple(e, inequal, all_zero);
}
// GPU verification kernel - compares device result against reference using relative and absolute
// tolerance. Tracks all errors (no early exit) to provide detailed error reporting.
//
// Uses LDS (shared memory) for block-level reduction to minimize atomic contention.
// This reduces atomic operations from O(errors) to O(blocks), providing massive speedup
// when there are many errors.
template <int BlockSize, typename T, typename IteratorA, typename IteratorB>
__global__ __launch_bounds__(BlockSize) //
void gpu_verify_kernel(IteratorA device_result_it,
IteratorB reference_result_it,
float rtol,
float atol,
long long size,
GpuVerifyDeviceResult* result)
{
auto device_result = make_concrete_iterator<T>(device_result_it);
auto reference_result = make_concrete_iterator<T>(reference_result_it);
auto local_result = GpuVerifyDeviceResult::identity();
// Grid-stride loop to handle any tensor size
long long idx = blockIdx.x * BlockSize + threadIdx.x;
long long stride = BlockSize * gridDim.x;
for(long long i = idx; i < size; i += stride)
{
const auto [abs_diff, inequal, bitwise_zero] =
compare_elements(device_result[i], reference_result[i], rtol, atol);
local_result = reduce_results(local_result,
GpuVerifyDeviceResult{
static_cast<uint64_t>(inequal), // error_count
abs_diff, // max_error
bitwise_zero // all_zero
});
}
const auto block_result = block_reduce<BlockSize>(local_result, reduce_results);
// Final reduction of remaining 32 elements in thread 0
if(threadIdx.x == 0)
{
// Single atomic update per block (reduces contention from O(errors) to O(blocks))
if(block_result.error_count > 0)
{
atomicAdd(&result->error_count, block_result.error_count);
atomicMax(&result->max_error, block_result.max_error);
}
if(!block_result.all_zero)
{
// A nonzero was found, so set the global value to false.
// Note: this is a benign race condition; technically a race condition but
// all blocks write the same value, so its fine.
result->all_zero = 0;
}
}
}
// Host-side wrapper for GPU verification with explicit tolerances
// Returns GpuVerifyResult with detailed error information
template <typename T, typename IteratorA, typename IteratorB>
GpuVerifyResult gpu_verify(IteratorA device_result,
IteratorB reference_result,
float rtol,
float atol,
std::size_t size,
hipStream_t stream = nullptr)
{
// Allocate result buffer on device
GpuVerifyDeviceResult* result_dev;
hip_check_error(hipMalloc(&result_dev, sizeof(GpuVerifyDeviceResult)));
// Initialize result struct
auto result_host = GpuVerifyDeviceResult::identity();
hip_check_error(
hipMemcpy(result_dev, &result_host, sizeof(GpuVerifyDeviceResult), hipMemcpyHostToDevice));
// Launch persistent kernel.
// automatically derive the optimal grid size from the kernel's occupancy and the
// number of multiprocessors.
constexpr int block_size = 256;
const auto kernel = gpu_verify_kernel<block_size, T, IteratorA, IteratorB>;
launch_persistent_kernel(kernel,
block_size,
stream,
device_result,
reference_result,
rtol,
atol,
static_cast<long long>(size),
result_dev);
// Get result
hip_check_error(
hipMemcpy(&result_host, result_dev, sizeof(GpuVerifyDeviceResult), hipMemcpyDeviceToHost));
// Free device memory
hip_check_error(hipFree(result_dev));
// Build and return result struct
GpuVerifyResult result;
result.error_count = result_host.error_count;
result.max_error = result_host.max_error;
result.total = size;
result.all_zero = result_host.all_zero == 1;
return result;
}
// Forward declaration of gpu_reduce_max
template <typename T, typename Iterator>
float gpu_reduce_max(Iterator device_buffer, std::size_t size, hipStream_t stream = nullptr);
// Host-side wrapper for GPU verification with automatic tolerance computation
// Computes max value on GPU, then computes tolerances and verifies
// Returns GpuVerifyResult with detailed error information
template <typename OutDataType,
typename ComputeDataType = OutDataType,
typename AccDataType = ComputeDataType,
typename IteratorA,
typename IteratorB>
GpuVerifyResult gpu_verify(IteratorA device_result,
IteratorB reference_result,
int number_of_accumulations,
std::size_t size,
hipStream_t stream = nullptr)
{
// Compute max absolute value on GPU (only 4 bytes transferred!)
double max_abs_value =
static_cast<double>(gpu_reduce_max<OutDataType>(reference_result, size, stream));
// Compute tolerances based on data types and accumulation count
float rtol = compute_relative_tolerance<ComputeDataType, OutDataType, AccDataType>(
number_of_accumulations);
float atol = 0.0f;
// Only compute absolute tolerance for supported types
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
if constexpr((std::is_same_v<ComputeDataType, F16> || std::is_same_v<ComputeDataType, BF16> ||
std::is_same_v<ComputeDataType, F32>) &&
(std::is_same_v<OutDataType, F16> || std::is_same_v<OutDataType, BF16> ||
std::is_same_v<OutDataType, F32>) &&
(std::is_same_v<AccDataType, F16> || std::is_same_v<AccDataType, BF16> ||
std::is_same_v<AccDataType, F32>))
{
atol = static_cast<float>(
ck::utils::get_absolute_threshold<ComputeDataType, OutDataType, AccDataType>(
max_abs_value, number_of_accumulations));
}
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "verify: accumulations=" << number_of_accumulations << " rtol = " << rtol
<< " atol=" << atol << std::endl;
}
// Call the explicit tolerance version
return gpu_verify<OutDataType>(device_result, reference_result, rtol, atol, size, stream);
}
// GPU reduction kernel for computing max(abs(data))
// This is an internal kernel called only by gpu_reduce_max() wrapper.
template <int BlockSize, typename T, typename Iterator>
__global__ __launch_bounds__((BlockSize)) //
void gpu_reduce_max_kernel(Iterator it, long long size, float* __restrict__ max_val)
{
auto data = make_concrete_iterator<T>(it);
long long idx = blockIdx.x * BlockSize + threadIdx.x;
long long stride = BlockSize * gridDim.x;
float local_max = 0.0f;
for(long long i = idx; i < size; i += stride)
{
float val = fabsf(type_convert<float>(data[i]));
local_max = fmaxf(local_max, val);
}
const auto block_max = block_reduce<BlockSize>(
local_max, [](const auto& a, const auto& b) { return std::max(a, b); });
if(threadIdx.x == 0)
atomicMax(max_val, block_max);
}
// Host-side wrapper for GPU max reduction
// Computes max(abs(data)) and returns as float
// Only transfers 4 bytes (the final max value) instead of entire tensor
template <typename T, typename Iterator>
float gpu_reduce_max(Iterator device_buffer, std::size_t size, hipStream_t stream)
{
if(size == 0)
{
return 0.0f;
}
// Allocate device memory for result
float* max_dev;
hip_check_error(hipMalloc(&max_dev, sizeof(float)));
// Initialize to zero
float init_val = 0.0f;
hip_check_error(hipMemcpy(max_dev, &init_val, sizeof(float), hipMemcpyHostToDevice));
// Launch persistent kernel.
// automatically derive the optimal grid size from the kernel's occupancy and the
// number of multiprocessors.
constexpr int block_size = 256;
const auto kernel = gpu_reduce_max_kernel<block_size, T, Iterator>;
launch_persistent_kernel(
kernel, block_size, stream, device_buffer, static_cast<long long>(size), max_dev);
// Copy result to host (only 4 bytes!)
float max_host;
hip_check_error(hipMemcpy(&max_host, max_dev, sizeof(float), hipMemcpyDeviceToHost));
// Free device memory
hip_check_error(hipFree(max_dev));
return max_host;
}
} // namespace profiler
} // namespace ck
#if __clang_major__ >= 23
#pragma clang diagnostic pop
#endif