Merge commit 'f173642087ed6034a0ac16188de2f36f4c008945' into develop

assistant-librarian[bot]
2026-01-14 15:15:11 +00:00
parent 41bf6ecb5a
commit 7501af9cc6
5 changed files with 328 additions and 222 deletions

View File

@@ -3,6 +3,9 @@
#pragma once
#include <iomanip>
#include <iostream>
#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
#include "ck/utility/type.hpp"
@@ -13,6 +16,46 @@
namespace ck {
namespace profiler {
// Result struct for GPU verification with detailed error reporting
// Provides backward compatibility via operator bool()
struct GpuVerifyResult
{
unsigned long long error_count; // Number of elements that exceeded tolerance
float max_error; // Maximum error value observed
std::size_t total; // Total number of elements compared
bool all_zero; // True if device result is all zeros (likely kernel issue)
// Implicit conversion to bool for backward compatibility
// Allows: if (gpu_verify(...)) { ... }
operator bool() const { return error_count == 0; }
// Calculate error percentage
float error_percentage() const
{
if(total == 0)
return 0.0f;
return static_cast<float>(error_count) / static_cast<float>(total) * 100.0f;
}
// Print error summary to stderr (matches check_err format)
void print_error_summary() const
{
if(error_count > 0)
{
if(all_zero)
{
std::cerr << "WARNING: Device result is all zeros - kernel may not have executed "
"properly!"
<< std::endl;
}
std::cerr << "max err: " << max_error;
std::cerr << ", number of errors: " << error_count;
std::cerr << ", " << std::setprecision(2) << std::fixed << error_percentage()
<< "% wrong values" << std::endl;
}
}
};
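Since operator bool() is true only when error_count == 0, existing boolean call sites keep working unchanged. A minimal usage sketch (the dev_out/dev_ref device pointers and N are assumed for illustration, not part of this diff):
// Hypothetical call site: dev_out and dev_ref hold N floats on the device
auto result = ck::profiler::gpu_verify<float>(dev_out, dev_ref, /*rtol=*/1e-3f, /*atol=*/1e-3f, N);
if(!result) // implicit operator bool(): passes iff error_count == 0
{
    result.print_error_summary(); // max err, error count, and percentage to stderr
}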
// Compute relative tolerance for GPU verification
// Matches the logic of ck::utils::get_relative_threshold but handles all types
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
@@ -63,16 +106,45 @@ inline float compute_relative_tolerance(const int number_of_accumulations = 1)
}
}
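The body of compute_relative_tolerance is elided by this hunk; a sketch of the shape such a threshold typically takes, assuming it mirrors ck::utils::get_relative_threshold (not the literal elided code):
// Sketch only: the tolerance scales the output type's machine epsilon by the
// accumulation count, so rounding noise from K accumulations is not flagged.
inline float sketch_relative_tolerance_fp16(int number_of_accumulations)
{
    constexpr float fp16_eps = 0.0009765625f; // 2^-10, half-precision epsilon
    return fp16_eps * static_cast<float>(number_of_accumulations);
}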
// Device-side result structure for kernel output
// Packed into a single struct to minimize device memory allocations
struct GpuVerifyDeviceResult
{
unsigned long long error_count; // Number of errors found
float max_error; // Maximum error value
int all_zero; // 1 = device result is all zeros, 0 = has non-zero values
};
// GPU verification kernel - compares device result against reference using relative and absolute
-// tolerance Returns 1 in passed if all elements match within tolerance, 0 otherwise
+// tolerance. Tracks all errors (no early exit) to provide detailed error reporting.
//
// Uses LDS (shared memory) for block-level reduction to minimize atomic contention.
// This reduces atomic operations from O(errors) to O(blocks), providing massive speedup
// when there are many errors.
//
// Assumption: Block size is 256
template <typename T>
__global__ void gpu_verify_kernel(const T* __restrict__ device_result,
const T* __restrict__ reference_result,
float rtol,
float atol,
long long size,
-int* passed)
+GpuVerifyDeviceResult* result)
{
constexpr int block_size = 256;
// Shared memory for block-level reduction
__shared__ unsigned long long shared_error_count[block_size];
__shared__ float shared_max_error[block_size];
__shared__ int shared_has_error[block_size];
__shared__ int shared_has_nonzero[block_size];
// Thread-local accumulators (in registers)
unsigned long long local_error_count = 0;
float local_max_error = 0.0f;
int local_has_error = 0;
int local_has_nonzero = 0;
// Grid-stride loop to handle any tensor size
long long idx = blockIdx.x * blockDim.x + threadIdx.x;
long long stride = blockDim.x * gridDim.x;
@@ -83,35 +155,95 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result,
float dev_val = type_convert<float>(device_result[i]);
float ref_val = type_convert<float>(reference_result[i]);
// Check if device value is non-zero
if(dev_val != 0.0f)
{
local_has_nonzero = 1;
}
// Compute absolute difference
float abs_diff = fabsf(dev_val - ref_val);
// Check tolerance (matches CPU check_err logic: err > atol + rtol * abs(ref))
if(abs_diff > atol + rtol * fabsf(ref_val))
{
-atomicMin(passed, 0); // Mark as failed
-return; // Early exit on first failure
+local_has_error = 1;
local_error_count++;
local_max_error = fmaxf(local_max_error, abs_diff);
}
}
// Store thread-local results to shared memory
shared_error_count[threadIdx.x] = local_error_count;
shared_max_error[threadIdx.x] = local_max_error;
shared_has_error[threadIdx.x] = local_has_error;
shared_has_nonzero[threadIdx.x] = local_has_nonzero;
__syncthreads();
// Block-level reduction: 256 -> 128 -> 64 -> 32
for(unsigned int s = block_size / 2; s >= 32; s >>= 1)
{
if(threadIdx.x < s)
{
shared_error_count[threadIdx.x] += shared_error_count[threadIdx.x + s];
shared_max_error[threadIdx.x] =
fmaxf(shared_max_error[threadIdx.x], shared_max_error[threadIdx.x + s]);
shared_has_error[threadIdx.x] |= shared_has_error[threadIdx.x + s];
shared_has_nonzero[threadIdx.x] |= shared_has_nonzero[threadIdx.x + s];
}
__syncthreads();
}
// Final reduction of remaining 32 elements in thread 0
if(threadIdx.x == 0)
{
for(int i = 1; i < 32; ++i)
{
shared_error_count[0] += shared_error_count[i];
shared_max_error[0] = fmaxf(shared_max_error[0], shared_max_error[i]);
shared_has_error[0] |= shared_has_error[i];
shared_has_nonzero[0] |= shared_has_nonzero[i];
}
// Single atomic update per block (reduces contention from O(errors) to O(blocks))
if(shared_has_error[0])
{
atomicAdd(&result->error_count, shared_error_count[0]);
atomicMax(&result->max_error, shared_max_error[0]);
}
// Update all_zero flag: only the clear-to-0 path below has an effect, since the
// flag is initialized to 1 on the host; atomicMin with 1 merely keeps it set
if(!shared_has_nonzero[0])
{
atomicMin(&result->all_zero, 1);
}
else
{
atomicMin(&result->all_zero, 0);
}
}
}
// Host-side wrapper for GPU verification with explicit tolerances
-// Returns true if verification passed, false otherwise
+// Returns GpuVerifyResult with detailed error information
template <typename T>
-bool gpu_verify(const void* device_result,
-const void* reference_result,
-float rtol,
-float atol,
-std::size_t size,
-hipStream_t stream = nullptr)
+GpuVerifyResult gpu_verify(const void* device_result,
+const void* reference_result,
+float rtol,
+float atol,
+std::size_t size,
+hipStream_t stream = nullptr)
{
// Allocate result buffer on device
-int* passed_dev;
-hip_check_error(hipMalloc(&passed_dev, sizeof(int)));
+GpuVerifyDeviceResult* result_dev;
+hip_check_error(hipMalloc(&result_dev, sizeof(GpuVerifyDeviceResult)));
-// Initialize to passed (1)
-int passed_host = 1;
-hip_check_error(hipMemcpy(passed_dev, &passed_host, sizeof(int), hipMemcpyHostToDevice));
+// Initialize result struct
+GpuVerifyDeviceResult result_host;
+result_host.error_count = 0; // No errors yet
+result_host.max_error = 0.0f; // No error observed
+result_host.all_zero = 1; // Start assuming all zeros (will be cleared if nonzero found)
+hip_check_error(
+hipMemcpy(result_dev, &result_host, sizeof(GpuVerifyDeviceResult), hipMemcpyHostToDevice));
// Launch kernel with grid-stride loop
// Cap the grid at 65535 blocks in x (a conservative, portable limit rather than a
// hard hardware limit on HIP); the grid-stride loop covers any remaining elements
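The launch statement itself falls outside the displayed hunk; a sketch of the grid math that comment describes, assumed to match the elided code (T, size, rtol, atol, stream, and result_dev are in scope here):
// Sketch: enough 256-thread blocks to cover size, capped at 65535; the
// grid-stride loop inside the kernel picks up any remainder.
constexpr int block_size = 256;
const long long blocks_needed = (static_cast<long long>(size) + block_size - 1) / block_size;
const int grid_size = static_cast<int>(blocks_needed < 65535 ? blocks_needed : 65535);
gpu_verify_kernel<T><<<grid_size, block_size, 0, stream>>>(
    static_cast<const T*>(device_result),
    static_cast<const T*>(reference_result),
    rtol, atol, static_cast<long long>(size), result_dev);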
@@ -125,7 +257,7 @@ bool gpu_verify(const void* device_result,
rtol,
atol,
static_cast<long long>(size),
-passed_dev);
+result_dev);
hip_check_error(hipGetLastError());
@@ -133,12 +265,20 @@ bool gpu_verify(const void* device_result,
hip_check_error(hipStreamSynchronize(stream));
// Get result
-hip_check_error(hipMemcpy(&passed_host, passed_dev, sizeof(int), hipMemcpyDeviceToHost));
+hip_check_error(
+hipMemcpy(&result_host, result_dev, sizeof(GpuVerifyDeviceResult), hipMemcpyDeviceToHost));
// Free device memory
-hip_check_error(hipFree(passed_dev));
+hip_check_error(hipFree(result_dev));
-return passed_host == 1;
+// Build and return result struct
+GpuVerifyResult result;
+result.error_count = result_host.error_count;
+result.max_error = result_host.max_error;
+result.total = size;
+result.all_zero = (result_host.all_zero == 1);
+return result;
}
// Forward declaration of gpu_reduce_max
@@ -147,15 +287,15 @@ float gpu_reduce_max(const void* device_buffer, std::size_t size, hipStream_t st
// Host-side wrapper for GPU verification with automatic tolerance computation
// Computes max value on GPU, then computes tolerances and verifies
-// Returns true if verification passed, false otherwise
+// Returns GpuVerifyResult with detailed error information
template <typename OutDataType,
typename ComputeDataType = OutDataType,
typename AccDataType = ComputeDataType>
-bool gpu_verify(const void* device_result,
-const void* reference_result,
-int number_of_accumulations,
-std::size_t size,
-hipStream_t stream = nullptr)
+GpuVerifyResult gpu_verify(const void* device_result,
+const void* reference_result,
+int number_of_accumulations,
+std::size_t size,
+hipStream_t stream = nullptr)
{
// Compute max absolute value on GPU (only 4 bytes transferred!)
double max_abs_value =
@@ -187,24 +327,6 @@ bool gpu_verify(const void* device_result,
return gpu_verify<OutDataType>(device_result, reference_result, rtol, atol, size, stream);
}
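The tolerance derivation between the max-value reduction and that final call is elided from the display; a hedged sketch of the step (compute_relative_tolerance is from this header, while the atol scaling is an assumption and may differ from the elided code):
// Sketch (assumed): derive tolerances from the GPU-reduced max value
const float rtol = compute_relative_tolerance<ComputeDataType, OutDataType, AccDataType>(
    number_of_accumulations);
const float atol = rtol * static_cast<float>(max_abs_value); // assumed scaling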
-//
-// Helper function for atomic float max (using compare-and-swap)
-__device__ __forceinline__ float atomicMaxFloat(float* address, float val)
-{
-int* address_as_int = reinterpret_cast<int*>(address);
-int old = *address_as_int;
-int assumed;
-do
-{
-assumed = old;
-old =
-atomicCAS(address_as_int, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed))));
-} while(assumed != old);
-return __int_as_float(old);
-}
// GPU reduction kernel for computing max(abs(data))
// This is an internal kernel called only by gpu_reduce_max() wrapper.
//
@@ -231,7 +353,7 @@ gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restr
__syncthreads();
// Block-level reduction: 256 -> 128 -> 64 -> 32
-for(unsigned int s = block_size / 2; s > 32; s >>= 1)
+for(unsigned int s = block_size / 2; s >= 32; s >>= 1)
{
if(threadIdx.x < s)
{
@@ -240,26 +362,16 @@ gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restr
__syncthreads();
}
-// Warp-level reduction: 32 -> 16 -> 8 -> 4 -> 2 -> 1
-// No sync needed within a warp
-if(threadIdx.x < 32)
-{
-volatile float* smem = shared_max;
-smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 32]);
-smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 16]);
-smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 8]);
-smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 4]);
-smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 2]);
-smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 1]);
-}
// Two-phase reduction pattern minimizes atomic contention:
// 1. Each block reduces to shared memory (above)
// 2. Single thread per block updates global max (below)
// This limits atomic operations to O(grid_size) rather than O(total_threads)
// Final reduction of remaining 32 elements in thread 0
if(threadIdx.x == 0)
{
-atomicMaxFloat(max_val, shared_max[0]);
+for(int i = 1; i < 32; ++i)
+{
+shared_max[0] = fmaxf(shared_max[0], shared_max[i]);
+}
+// Single atomic update per block
+atomicMax(max_val, shared_max[0]);
}
}
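The gpu_reduce_max() host wrapper that launches this kernel is only forward-declared in the displayed hunks; by analogy with gpu_verify above it plausibly looks as follows (a sketch under that assumption, not the actual wrapper):
// Sketch only: allocate a single float, run the reduction, copy 4 bytes back.
template <typename T>
float gpu_reduce_max_sketch(const void* device_buffer, std::size_t size, hipStream_t stream = nullptr)
{
    float* max_dev;
    hip_check_error(hipMalloc(&max_dev, sizeof(float)));
    float max_host = 0.0f; // safe identity: the kernel reduces max(abs(x)) >= 0
    hip_check_error(hipMemcpy(max_dev, &max_host, sizeof(float), hipMemcpyHostToDevice));
    constexpr int block_size = 256;
    const long long blocks = (static_cast<long long>(size) + block_size - 1) / block_size;
    const int grid_size = static_cast<int>(blocks < 65535 ? blocks : 65535);
    gpu_reduce_max_kernel<T><<<grid_size, block_size, 0, stream>>>(
        static_cast<const T*>(device_buffer), static_cast<long long>(size), max_dev);
    hip_check_error(hipGetLastError());
    hip_check_error(hipStreamSynchronize(stream));
    hip_check_error(hipMemcpy(&max_host, max_dev, sizeof(float), hipMemcpyDeviceToHost));
    hip_check_error(hipFree(max_dev));
    return max_host; // only these 4 bytes cross host-device, as noted earlier
}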

View File

@@ -20,7 +20,7 @@
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
#include "profiler/gpu_verification.hpp"
#include "ck/library/utility/gpu_verification.hpp"
namespace ck {
namespace profiler {
@@ -58,37 +58,63 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
std::cout << "out: " << out_g_n_k_wos_desc << std::endl;
std::cout << "wei: " << wei_g_k_c_xs_desc << std::endl;
std::cout << "in: " << in_g_n_c_wis_desc << std::endl;
// Get element space sizes
const auto out_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize();
const auto wei_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize();
const auto in_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize();
// Allocate GPU buffers
DeviceMem out_device_buf(sizeof(OutDataType) * out_element_space_size);
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_element_space_size);
DeviceMem in_device_buf(sizeof(InDataType) * in_element_space_size);
// Generate data directly on GPU using DeviceMem methods
switch(init_method)
{
case 0:
// Zero initialization
out_device_buf.SetZero();
wei_device_buf.SetZero();
break;
case 1:
// Discrete integer values in range [-5, 5]
out_device_buf.FillUniformRandInteger<OutDataType>(-5, 5);
wei_device_buf.FillUniformRandInteger<WeiDataType>(-5, 5);
break;
case 2:
// Continuous float values
out_device_buf.FillUniformRandFp<OutDataType>(0.0f, 1.0f);
wei_device_buf.FillUniformRandFp<WeiDataType>(-0.5f, 0.5f);
break;
default:
// Constant value 1
out_device_buf.SetValue<OutDataType>(ck::type_convert<OutDataType>(1));
wei_device_buf.SetValue<WeiDataType>(ck::type_convert<WeiDataType>(1));
}
// Create host tensors (needed only for verification)
Tensor<OutDataType> out(out_g_n_k_wos_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<InDataType> in_host(in_g_n_c_wis_desc);
Tensor<InDataType> in_device(in_g_n_c_wis_desc);
std::cout << "out: " << out.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "in: " << in_host.mDesc << std::endl;
switch(init_method)
// Copy GPU→CPU only if verification is enabled
if(do_verification == 1 || do_verification == 2)
{
-case 0: break;
-case 1:
-out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-break;
-case 2:
-out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
-wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
-break;
-default:
-out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
-wei.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
+out_device_buf.FromDevice(out.mData.data());
+wei_device_buf.FromDevice(wei.mData.data());
}
-DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
-DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
-DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
-out_device_buf.ToDevice(out.mData.data());
-wei_device_buf.ToDevice(wei.mData.data());
// Copy to host only if CPU verification is needed
if(do_verification == 1)
{
out_device_buf.FromDevice(out.mData.data());
wei_device_buf.FromDevice(wei.mData.data());
}
// Allocate GPU reference buffer (used only if do_verification == 2)
DeviceMem gpu_ref_in_buf(
@@ -237,52 +263,24 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
// Perform GPU verification (max value computed internally on GPU)
const std::size_t tensor_size = in_device.mDesc.GetElementSpaceSize();
-bool gpu_passed = ck::profiler::gpu_verify<InDataType, ComputeType, AccDataType>(
+auto gpu_result = ck::profiler::gpu_verify<InDataType, ComputeType, AccDataType>(
in_device_buf.GetDeviceBuffer(),
gpu_ref_in_buf.GetDeviceBuffer(),
total_accums,
tensor_size);
-if(!gpu_passed)
+if(!gpu_result)
{
-// GPU verification failed - fall back to CPU for detailed diagnostics
-std::cout << "GPU verification failed, running CPU verification for details..."
-<< std::endl;
-// Copy both buffers to host
-in_device_buf.FromDevice(in_device.mData.data());
-gpu_ref_in_buf.FromDevice(in_host.mData.data());
-// Recalculate tolerances for CPU verification with original logic
-auto rtol =
-ck::utils::get_relative_threshold<ComputeType, InDataType, AccDataType>(
-num_accums);
-auto atol =
-ck::utils::get_absolute_threshold<ComputeType, InDataType, AccDataType>(
-max_accumulated_value / split_k_for_run, num_accums);
-if(split_k_for_run > 1)
-{
-auto rtol_split_k =
-ck::utils::get_relative_threshold<InDataType, InDataType, InDataType>(
-split_k_for_run);
-auto atol_split_k =
-ck::utils::get_absolute_threshold<InDataType, InDataType, InDataType>(
-max_accumulated_value, split_k_for_run);
-rtol = std::max(rtol, rtol_split_k);
-atol = std::max(atol, atol_split_k);
-}
-// Run CPU verification for detailed error messages
-ck::utils::check_err(
-in_device, in_host, "Error: Incorrect results!", rtol, atol);
+// GPU verification failed - print detailed error summary
+gpu_result.print_error_summary();
pass = false;
-std::cout << "Relative error threshold: " << rtol
-<< " Absolute error threshold: " << atol << std::endl;
if(do_log)
{
// Copy buffers to host for logging
in_device_buf.FromDevice(in_device.mData.data());
gpu_ref_in_buf.FromDevice(in_host.mData.data());
LogRangeAsType<float>(std::cout << "output : ", out.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "weight: ", wei.mData, ",") << std::endl;

View File

@@ -24,7 +24,7 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
#include "profiler/gpu_verification.hpp"
#include "ck/library/utility/gpu_verification.hpp"
namespace ck {
namespace profiler {
@@ -63,35 +63,52 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
std::cout << "input: " << in_g_n_c_wis_desc << std::endl;
std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl;
std::cout << "output: " << out_g_n_k_wos_desc << std::endl;
// Get element space sizes
const auto input_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize();
const auto weight_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize();
const auto output_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize();
// Allocate GPU buffers
DeviceMem in_device_buf(sizeof(InDataType) * input_element_space_size);
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_element_space_size);
DeviceMem out_device_buf(sizeof(OutDataType) * output_element_space_size);
// Generate data directly on GPU using DeviceMem methods
switch(init_method)
{
case 0:
// Zero initialization
in_device_buf.SetZero();
out_device_buf.SetZero();
break;
case 1:
// Discrete integer values in range [-5, 5]
in_device_buf.FillUniformRandInteger<InDataType>(-5, 5);
out_device_buf.FillUniformRandInteger<OutDataType>(-5, 5);
break;
default:
// Continuous float values
in_device_buf.FillUniformRandFp<InDataType>(0.0f, 1.0f);
out_device_buf.FillUniformRandFp<OutDataType>(-0.5f, 0.5f);
}
// Create host tensors (needed only for verification)
Tensor<InDataType> input(in_g_n_c_wis_desc);
Tensor<WeiDataType> weight_host_result(wei_g_k_c_xs_desc);
Tensor<WeiDataType> weight_device_result(wei_g_k_c_xs_desc);
Tensor<OutDataType> output(out_g_n_k_wos_desc);
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weight: " << weight_host_result.mDesc << std::endl;
std::cout << "output: " << output.mDesc << std::endl;
switch(init_method)
// Copy to host only if CPU verification is needed
if(do_verification == 1)
{
-case 0: break;
-case 1:
-input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-output.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-break;
-default:
-input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-output.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+in_device_buf.FromDevice(input.mData.data());
+out_device_buf.FromDevice(output.mData.data());
}
-DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
-DeviceMem wei_device_buf(sizeof(WeiDataType) *
-weight_device_result.mDesc.GetElementSpaceSize());
-DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize());
-in_device_buf.ToDevice(input.mData.data());
-out_device_buf.ToDevice(output.mData.data());
// Allocate GPU reference buffer (used only if do_verification == 2)
DeviceMem gpu_ref_wei_buf(
do_verification == 2 ? sizeof(WeiDataType) * weight_host_result.mDesc.GetElementSpaceSize()
@@ -343,63 +360,28 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
// Perform GPU verification (max value computed internally on GPU)
const std::size_t tensor_size =
weight_device_result.mDesc.GetElementSpaceSize();
-bool gpu_passed =
+auto gpu_result =
ck::profiler::gpu_verify<WeiDataType, ComputeType, AccDataType>(
wei_device_buf.GetDeviceBuffer(),
gpu_ref_wei_buf.GetDeviceBuffer(),
total_accums,
tensor_size);
-if(!gpu_passed)
+if(!gpu_result)
{
-// GPU verification failed - fall back to CPU for detailed diagnostics
-std::cout
-<< "GPU verification failed, running CPU verification for details..."
-<< std::endl;
-// Copy both buffers to host
-wei_device_buf.FromDevice(weight_device_result.mData.data());
-gpu_ref_wei_buf.FromDevice(weight_host_result.mData.data());
-// Recalculate tolerances for CPU verification with original logic
-const index_t num_accums_full = output.GetElementSize() / conv_param.K_;
-const index_t num_accums_split_k = split_k_value;
-auto rtol = ck::utils::
-get_relative_threshold<ComputeType, WeiDataType, AccDataType>(
-num_accums_full / num_accums_split_k);
-auto atol = ck::utils::
-get_absolute_threshold<ComputeType, WeiDataType, AccDataType>(
-max_accumulated_value / num_accums_split_k,
-num_accums_full / num_accums_split_k);
-if(split_k_value > 1)
-{
-auto rtol_split_k =
-ck::utils::get_relative_threshold<WeiDataType,
-WeiDataType,
-WeiDataType>(num_accums_split_k);
-auto atol_split_k = ck::utils::
-get_absolute_threshold<WeiDataType, WeiDataType, WeiDataType>(
-max_accumulated_value, num_accums_split_k);
-rtol = std::max(rtol, rtol_split_k);
-atol = std::max(atol, atol_split_k);
-}
-// Run CPU verification for detailed error messages
-ck::utils::check_err(weight_device_result,
-weight_host_result,
-"Error: Incorrect results!",
-rtol,
-atol);
+// GPU verification failed - print detailed error summary
+gpu_result.print_error_summary();
all_pass = false;
-std::cout << "Relative error threshold: " << rtol
-<< " Absolute error threshold: " << atol << std::endl;
std::cout << "Fail info: splitK: " << split_k_value << " "
<< op_ptr->GetTypeString() << std::endl;
if(do_log)
{
// Copy buffers to host for logging
wei_device_buf.FromDevice(weight_device_result.mData.data());
gpu_ref_wei_buf.FromDevice(weight_host_result.mData.data());
LogRangeAsType<float>(std::cout << "output : ", output.mData, ",")
<< std::endl;
LogRangeAsType<float>(

View File

@@ -23,7 +23,7 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
#include "profiler/gpu_verification.hpp"
#include "ck/library/utility/gpu_verification.hpp"
namespace ck {
namespace profiler {
@@ -86,34 +86,52 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
// Get element space sizes for GPU allocation
const auto input_size = in_g_n_c_wis_desc.GetElementSpaceSize();
const auto weight_size = wei_g_k_c_xs_desc.GetElementSpaceSize();
const auto output_size = out_g_n_k_wos_desc.GetElementSpaceSize();
std::cout << "input: " << in_g_n_c_wis_desc << std::endl;
std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl;
std::cout << "output: " << out_g_n_k_wos_desc << std::endl;
// Allocate GPU memory first (GPU-first workflow)
DeviceMem in_device_buf(sizeof(InDataType) * input_size);
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_size);
DeviceMem out_device_buf(sizeof(OutDataType) * output_size);
// Generate data directly on GPU using DeviceMem methods
switch(init_method)
{
case 0:
// Zero initialization
in_device_buf.SetZero();
wei_device_buf.SetZero();
break;
case 1:
// Discrete integer generation: {-5, -4, -3, ..., 3, 4}
in_device_buf.FillUniformRandInteger<InDataType>(-5, 5);
wei_device_buf.FillUniformRandInteger<WeiDataType>(-5, 5);
break;
default:
// Continuous float generation
in_device_buf.FillUniformRandFp<InDataType>(0.0f, 1.0f);
wei_device_buf.FillUniformRandFp<WeiDataType>(-0.5f, 0.5f);
}
// Create host tensors (for verification if needed)
Tensor<InDataType> input(in_g_n_c_wis_desc);
Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
switch(init_method)
// Copy to host only if CPU verification is needed
if(do_verification == 1)
{
-case 0: break;
-case 1:
-input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-break;
-default:
-input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+in_device_buf.FromDevice(input.mData.data());
+wei_device_buf.FromDevice(weight.mData.data());
}
-DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
-DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
-DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
-in_device_buf.ToDevice(input.mData.data());
-wei_device_buf.ToDevice(weight.mData.data());
// Allocate GPU reference buffer (used only if do_verification == 2)
DeviceMem gpu_ref_out_buf(
do_verification == 2 ? sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize() : 0);
@@ -243,28 +261,24 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
// Perform GPU verification (max value computed internally on GPU)
const std::size_t tensor_size = device_output.mDesc.GetElementSpaceSize();
-bool gpu_passed = ck::profiler::gpu_verify<OutDataType, AComputeType, OutDataType>(
+auto gpu_result = ck::profiler::gpu_verify<OutDataType, AComputeType, OutDataType>(
out_device_buf.GetDeviceBuffer(),
gpu_ref_out_buf.GetDeviceBuffer(),
num_accums,
tensor_size);
-if(!gpu_passed)
+if(!gpu_result)
{
-// GPU verification failed - fall back to CPU for detailed diagnostics
-std::cout << "GPU verification failed, running CPU verification for details..."
-<< std::endl;
-// Copy both buffers to host
-out_device_buf.FromDevice(device_output.mData.data());
-gpu_ref_out_buf.FromDevice(host_output.mData.data());
-// Run CPU verification for detailed error messages
-ck::utils::check_err(device_output, host_output);
+// GPU verification failed - print detailed error summary
+gpu_result.print_error_summary();
pass = false;
if(do_log)
{
// Copy buffers to host for logging
out_device_buf.FromDevice(device_output.mData.data());
gpu_ref_out_buf.FromDevice(host_output.mData.data());
LogRangeAsType<float>(std::cout << "input : ", input.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",")

View File

@@ -13,8 +13,8 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/gpu_verification.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp"
#include "profiler/gpu_verification.hpp"
using namespace ck::profiler;
using ck::ref::SimpleDeviceMem;
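For orientation, a test built on these headers follows the same flow as the profilers above; a minimal smoke-test sketch (raw hipMalloc is used for self-containment, and the values and sizes are illustrative, not from this diff):
// Hypothetical smoke test: identical buffers must verify with zero errors.
std::vector<float> host(1024, 1.0f);
float *a = nullptr, *b = nullptr;
hip_check_error(hipMalloc(&a, host.size() * sizeof(float)));
hip_check_error(hipMalloc(&b, host.size() * sizeof(float)));
hip_check_error(hipMemcpy(a, host.data(), host.size() * sizeof(float), hipMemcpyHostToDevice));
hip_check_error(hipMemcpy(b, host.data(), host.size() * sizeof(float), hipMemcpyHostToDevice));
auto res = gpu_verify<float>(a, b, /*rtol=*/0.0f, /*atol=*/0.0f, host.size());
assert(res && res.error_count == 0 && !res.all_zero);
hip_check_error(hipFree(a));
hip_check_error(hipFree(b));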