// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#pragma once

#include <algorithm>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <type_traits>

#include <hip/hip_runtime.h>

#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
#include "ck/utility/type.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/library/utility/check_err.hpp"

namespace ck {
namespace profiler {

// Result struct for GPU verification with detailed error reporting
// Provides backward compatibility via operator bool()
struct GpuVerifyResult
{
    unsigned long long error_count; // Number of elements that exceeded tolerance
    float max_error;                // Maximum error value observed
    std::size_t total;              // Total number of elements compared
    bool all_zero;                  // True if device result is all zeros (likely kernel issue)

    // Implicit conversion to bool for backward compatibility
    // Allows: if (gpu_verify(...)) { ... }
    operator bool() const { return error_count == 0; }

    // Calculate error percentage
    float error_percentage() const
    {
        if(total == 0)
            return 0.0f;
        return static_cast<float>(error_count) / static_cast<float>(total) * 100.0f;
    }

    // Print error summary to stderr (matches check_err format)
    void print_error_summary() const
    {
        if(error_count > 0)
        {
            if(all_zero)
            {
                std::cerr << "WARNING: Device result is all zeros - kernel may not have executed "
                             "properly!"
                          << std::endl;
            }
            std::cerr << "max err: " << max_error;
            std::cerr << ", number of errors: " << error_count;
            std::cerr << ", " << std::setprecision(2) << std::fixed << error_percentage()
                      << "% wrong values" << std::endl;
        }
    }
};

// Compute relative tolerance for GPU verification
// Matches the logic of ck::utils::get_relative_threshold but handles all types
template <typename ComputeDataType, typename OutDataType, typename AccDataType>
inline float compute_relative_tolerance(const int number_of_accumulations = 1)
{
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
    using F32  = float;
    using I8   = int8_t;
    using I16  = int16_t;
    using I32  = int32_t;

    // For integer types, tolerance is 0 (integer arithmetic is exact)
    if constexpr(std::is_same_v<OutDataType, I8> || std::is_same_v<OutDataType, I16> ||
                 std::is_same_v<OutDataType, I32> || std::is_same_v<ComputeDataType, I8>)
    {
        return 0.0f;
    }
    // For types supported by get_relative_threshold, use it
    else if constexpr((std::is_same_v<ComputeDataType, F16> ||
                       std::is_same_v<ComputeDataType, BF16> ||
                       std::is_same_v<ComputeDataType, F32>) &&
                      (std::is_same_v<OutDataType, F16> || std::is_same_v<OutDataType, BF16> ||
                       std::is_same_v<OutDataType, F32>) &&
                      (std::is_same_v<AccDataType, F16> || std::is_same_v<AccDataType, BF16> ||
                       std::is_same_v<AccDataType, F32>))
    {
        return static_cast<float>(
            ck::utils::get_relative_threshold<ComputeDataType, OutDataType, AccDataType>(
                number_of_accumulations));
    }
    // For unsupported types (FP8, BF8, etc.), use default tolerances based on output type
    else
    {
        if constexpr(std::is_same_v<OutDataType, F32>)
        {
            return 1e-3f;
        }
        else if constexpr(std::is_same_v<OutDataType, F16>)
        {
            return 1e-1f;
        }
        else
        {
            // For FP8/BF8 and other types, use conservative tolerance
            return 1e-1f;
        }
    }
}

// Device-side result structure for kernel output
// Packed into a single struct to minimize device memory allocations
struct GpuVerifyDeviceResult
{
    unsigned long long error_count; // Number of errors found
    float max_error;                // Maximum error value
    int all_zero;                   // 1 = device result is all zeros, 0 = has non-zero values
};
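// Example (an illustrative sketch, not part of the API surface): for an fp16 GEMM
// that accumulates K = 256 products per output element, a caller might derive a
// relative tolerance with
//
//   float rtol = compute_relative_tolerance<ck::half_t, ck::half_t, float>(256);
//
// which forwards to ck::utils::get_relative_threshold, since all three types are
// in the supported F16/BF16/F32 set.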
// GPU verification kernel - compares device result against reference using relative and absolute
// tolerance. Tracks all errors (no early exit) to provide detailed error reporting.
//
// Uses LDS (shared memory) for block-level reduction to minimize atomic contention.
// This reduces atomic operations from O(errors) to O(blocks), providing a large speedup
// when there are many errors.
//
// Assumption: Block size is 256
template <typename T>
__global__ void gpu_verify_kernel(const T* __restrict__ device_result,
                                  const T* __restrict__ reference_result,
                                  float rtol,
                                  float atol,
                                  long long size,
                                  GpuVerifyDeviceResult* result)
{
    constexpr int block_size = 256;

    // Shared memory for block-level reduction
    __shared__ unsigned long long shared_error_count[block_size];
    __shared__ float shared_max_error[block_size];
    __shared__ int shared_has_error[block_size];
    __shared__ int shared_has_nonzero[block_size];

    // Thread-local accumulators (in registers)
    unsigned long long local_error_count = 0;
    float local_max_error                = 0.0f;
    int local_has_error                  = 0;
    int local_has_nonzero                = 0;

    // Grid-stride loop to handle any tensor size
    long long idx    = blockIdx.x * blockDim.x + threadIdx.x;
    long long stride = blockDim.x * gridDim.x;

    for(long long i = idx; i < size; i += stride)
    {
        // Convert to float for comparison
        float dev_val = type_convert<float>(device_result[i]);
        float ref_val = type_convert<float>(reference_result[i]);

        // Check if device value is non-zero
        if(dev_val != 0.0f)
        {
            local_has_nonzero = 1;
        }

        // Compute absolute difference
        float abs_diff = fabsf(dev_val - ref_val);

        // Check tolerance (matches CPU check_err logic: err > atol + rtol * abs(ref))
        if(abs_diff > atol + rtol * fabsf(ref_val))
        {
            local_has_error = 1;
            local_error_count++;
            local_max_error = fmaxf(local_max_error, abs_diff);
        }
    }

    // Store thread-local results to shared memory
    shared_error_count[threadIdx.x] = local_error_count;
    shared_max_error[threadIdx.x]   = local_max_error;
    shared_has_error[threadIdx.x]   = local_has_error;
    shared_has_nonzero[threadIdx.x] = local_has_nonzero;
    __syncthreads();

    // Block-level reduction: 256 -> 128 -> 64 -> 32
    for(unsigned int s = block_size / 2; s >= 32; s >>= 1)
    {
        if(threadIdx.x < s)
        {
            shared_error_count[threadIdx.x] += shared_error_count[threadIdx.x + s];
            shared_max_error[threadIdx.x] =
                fmaxf(shared_max_error[threadIdx.x], shared_max_error[threadIdx.x + s]);
            shared_has_error[threadIdx.x] |= shared_has_error[threadIdx.x + s];
            shared_has_nonzero[threadIdx.x] |= shared_has_nonzero[threadIdx.x + s];
        }
        __syncthreads();
    }

    // Final reduction of the remaining 32 elements in thread 0
    if(threadIdx.x == 0)
    {
        for(int i = 1; i < 32; ++i)
        {
            shared_error_count[0] += shared_error_count[i];
            shared_max_error[0] = fmaxf(shared_max_error[0], shared_max_error[i]);
            shared_has_error[0] |= shared_has_error[i];
            shared_has_nonzero[0] |= shared_has_nonzero[i];
        }

        // Single atomic update per block (reduces contention from O(errors) to O(blocks))
        if(shared_has_error[0])
        {
            atomicAdd(&result->error_count, shared_error_count[0]);
            atomicMax(&result->max_error, shared_max_error[0]);
        }

        // Clear the all_zero flag if this block saw any non-zero value; the flag
        // starts at 1 and can only be lowered, so no update is needed otherwise
        if(shared_has_nonzero[0])
        {
            atomicMin(&result->all_zero, 0);
        }
    }
}
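// The per-element acceptance test above, written out (the same criterion that
// ck::utils::check_err applies on the CPU):
//
//   |dev - ref| <= atol + rtol * |ref|
//
// For example, with rtol = 1e-3 and atol = 1e-3, a reference value of 2.0f
// tolerates an absolute deviation of up to 1e-3 + 1e-3 * 2.0 = 3e-3 before the
// element is counted as an error.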
// Host-side wrapper for GPU verification with explicit tolerances
// Returns GpuVerifyResult with detailed error information
template <typename T>
GpuVerifyResult gpu_verify(const void* device_result,
                           const void* reference_result,
                           float rtol,
                           float atol,
                           std::size_t size,
                           hipStream_t stream = nullptr)
{
    // Allocate result buffer on device
    GpuVerifyDeviceResult* result_dev;
    hip_check_error(hipMalloc(&result_dev, sizeof(GpuVerifyDeviceResult)));

    // Initialize result struct
    GpuVerifyDeviceResult result_host;
    result_host.error_count = 0;    // No errors yet
    result_host.max_error   = 0.0f; // No error observed
    result_host.all_zero    = 1;    // Start assuming all zeros (cleared if a non-zero is found)
    hip_check_error(
        hipMemcpy(result_dev, &result_host, sizeof(GpuVerifyDeviceResult), hipMemcpyHostToDevice));

    // Launch kernel with grid-stride loop
    // Cap the grid at 65535 blocks (a conservative limit for the x grid dimension);
    // the grid-stride loop handles any tensor size regardless of grid dimensions
    constexpr int block_size = 256;
    const int grid_size      = static_cast<int>(
        std::min<std::size_t>(65535, (size + block_size - 1) / block_size));

    gpu_verify_kernel<T>
        <<<grid_size, block_size, 0, stream>>>(static_cast<const T*>(device_result),
                                               static_cast<const T*>(reference_result),
                                               rtol,
                                               atol,
                                               static_cast<long long>(size),
                                               result_dev);
    hip_check_error(hipGetLastError());

    // Synchronize the stream to ensure kernel completion before reading results
    hip_check_error(hipStreamSynchronize(stream));

    // Get result
    hip_check_error(
        hipMemcpy(&result_host, result_dev, sizeof(GpuVerifyDeviceResult), hipMemcpyDeviceToHost));

    // Free device memory
    hip_check_error(hipFree(result_dev));

    // Build and return result struct
    GpuVerifyResult result;
    result.error_count = result_host.error_count;
    result.max_error   = result_host.max_error;
    result.total       = size;
    result.all_zero    = (result_host.all_zero == 1);
    return result;
}

// Forward declaration of gpu_reduce_max
template <typename T>
float gpu_reduce_max(const void* device_buffer, std::size_t size, hipStream_t stream = nullptr);

// Host-side wrapper for GPU verification with automatic tolerance computation
// Computes the max value on the GPU, then derives tolerances and verifies
// Returns GpuVerifyResult with detailed error information
template <typename ComputeDataType, typename OutDataType, typename AccDataType>
GpuVerifyResult gpu_verify(const void* device_result,
                           const void* reference_result,
                           int number_of_accumulations,
                           std::size_t size,
                           hipStream_t stream = nullptr)
{
    // Compute max absolute value on GPU (only 4 bytes transferred!)
    double max_abs_value =
        static_cast<double>(gpu_reduce_max<OutDataType>(reference_result, size, stream));

    // Compute tolerances based on data types and accumulation count
    float rtol = compute_relative_tolerance<ComputeDataType, OutDataType, AccDataType>(
        number_of_accumulations);
    float atol = 0.0f;

    // Only compute absolute tolerance for supported types
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
    using F32  = float;
    if constexpr((std::is_same_v<ComputeDataType, F16> || std::is_same_v<ComputeDataType, BF16> ||
                  std::is_same_v<ComputeDataType, F32>) &&
                 (std::is_same_v<OutDataType, F16> || std::is_same_v<OutDataType, BF16> ||
                  std::is_same_v<OutDataType, F32>) &&
                 (std::is_same_v<AccDataType, F16> || std::is_same_v<AccDataType, BF16> ||
                  std::is_same_v<AccDataType, F32>))
    {
        atol = static_cast<float>(
            ck::utils::get_absolute_threshold<ComputeDataType, OutDataType, AccDataType>(
                max_abs_value, number_of_accumulations));
    }

    // Call the explicit-tolerance version
    return gpu_verify<OutDataType>(device_result, reference_result, rtol, atol, size, stream);
}
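// Example usage (a sketch; the buffer names and the accumulation count K below
// are illustrative, not defined by this header):
//
//   // With explicit tolerances:
//   auto res = gpu_verify<ck::half_t>(dev_out, ref_out, /*rtol=*/1e-3f, /*atol=*/1e-3f, n);
//
//   // With tolerances derived from the data types and K accumulations per element:
//   auto res2 = gpu_verify<ck::half_t, ck::half_t, float>(dev_out, ref_out, /*K=*/256, n);
//
//   if(!res2) { res2.print_error_summary(); }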
// GPU reduction kernel for computing max(abs(data))
// This is an internal kernel called only by the gpu_reduce_max() wrapper.
//
// Assumption: Block size is 256
template <typename T>
__global__ void
gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restrict__ max_val)
{
    constexpr int block_size = 256;

    __shared__ float shared_max[block_size];

    long long idx    = blockIdx.x * blockDim.x + threadIdx.x;
    long long stride = blockDim.x * gridDim.x;

    float local_max = 0.0f;
    for(long long i = idx; i < size; i += stride)
    {
        float val = fabsf(type_convert<float>(data[i]));
        local_max = fmaxf(local_max, val);
    }

    shared_max[threadIdx.x] = local_max;
    __syncthreads();

    // Block-level reduction: 256 -> 128 -> 64 -> 32
    for(unsigned int s = block_size / 2; s >= 32; s >>= 1)
    {
        if(threadIdx.x < s)
        {
            shared_max[threadIdx.x] = fmaxf(shared_max[threadIdx.x], shared_max[threadIdx.x + s]);
        }
        __syncthreads();
    }

    // Final reduction of the remaining 32 elements in thread 0
    if(threadIdx.x == 0)
    {
        for(int i = 1; i < 32; ++i)
        {
            shared_max[0] = fmaxf(shared_max[0], shared_max[i]);
        }

        // Single atomic update per block
        atomicMax(max_val, shared_max[0]);
    }
}

// Host-side wrapper for GPU max reduction
// Computes max(abs(data)) and returns it as a float
// Only transfers 4 bytes (the final max value) instead of the entire tensor
template <typename T>
float gpu_reduce_max(const void* device_buffer, std::size_t size, hipStream_t stream)
{
    if(size == 0)
    {
        return 0.0f;
    }

    // Allocate device memory for the result
    float* max_dev;
    hip_check_error(hipMalloc(&max_dev, sizeof(float)));

    // Initialize to zero
    float init_val = 0.0f;
    hip_check_error(hipMemcpy(max_dev, &init_val, sizeof(float), hipMemcpyHostToDevice));

    // Launch reduction kernel
    // Use at most 1024 blocks to balance occupancy vs. grid-stride iterations
    // For large tensors (>256K elements, i.e. more than 1024 * 256 threads),
    // the grid-stride loop handles the remainder
    constexpr int block_size = 256;
    const int grid_size =
        static_cast<int>(std::min<std::size_t>(1024, (size + block_size - 1) / block_size));

    gpu_reduce_max_kernel<T><<<grid_size, block_size, 0, stream>>>(
        static_cast<const T*>(device_buffer), static_cast<long long>(size), max_dev);
    hip_check_error(hipGetLastError());

    // Synchronize the stream (nullptr selects the default stream) so the kernel
    // has finished before the result is read back
    hip_check_error(hipStreamSynchronize(stream));

    // Copy result to host (only 4 bytes!)
    float max_host;
    hip_check_error(hipMemcpy(&max_host, max_dev, sizeof(float), hipMemcpyDeviceToHost));

    // Free device memory
    hip_check_error(hipFree(max_dev));

    return max_host;
}

} // namespace profiler
} // namespace ck