[CK] Integrate GPU reference into ckProfiler for convolutions (#3379)

Refactor and integrate CK GPU references into ckProfiler.
- All convolution layouts and groupings supported for all three directions
- Unit tests verifying that the GPU and CPU references produce the same results
- Support added to profiler (do_verification = 2 enables GPU reference)
- One profiler-based test per direction changed to GPU reference to demonstrate usage

Closes AICK-427
This commit is contained in:
Johannes Graner
2025-12-18 07:59:45 +01:00
committed by GitHub
parent 87dd073887
commit bb8445dca8
31 changed files with 3351 additions and 953 deletions

View File

@@ -131,6 +131,9 @@ template <ck::index_t NDimSpatial,
typename WeiElementOp,
typename OutElementOp,
typename DeviceConvNDFwdInstance,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename ComputeDataType = OutDataType>
bool run_grouped_conv_fwd(int do_verification,
int init_method,
@@ -283,31 +286,25 @@ bool run_grouped_conv_fwd(int do_verification,
DeviceMem out_device_ref_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
out_device_ref_buf.SetZero();
// Extract dimensions using helper function
ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
// Launch GPU reference kernel
constexpr ck::index_t block_size = 256;
const ck::long_index_t output_length = dims.N * dims.Do * dims.Ho * dims.Wo * dims.K;
const ck::index_t grid_size = (output_length + block_size - 1) / block_size;
auto gpu_ref_kernel = ck::ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
WeiDataType,
OutDataType,
ComputeDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
gpu_ref_kernel<<<dim3(grid_size), dim3(block_size), 0, nullptr>>>(
// Call GPU reference with ConvParam directly, using the correct layout types
ck::ref::naive_conv_fwd<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
reinterpret_cast<OutDataType*>(out_device_ref_buf.GetDeviceBuffer()),
dims);
conv_param);
HIP_CHECK_ERROR(hipDeviceSynchronize());
std::cout << "GPU reference kernel completed successfully, copying results..." << std::endl;
std::cout << "GPU reference function completed successfully, copying results..."
<< std::endl;
// Copy GPU reference result to host
out_device_ref_buf.FromDevice(out_host.mData.data());

View File

@@ -12,7 +12,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
{
print_helper_msg();
int do_verification = 1; // 0=no, 1=CPU, 2=GPU
int do_verification = 2; // 0=no, 1=CPU, 2=GPU
int init_method = 1;
bool time_kernel = false;
@@ -71,6 +71,9 @@ bool run_convnd_fwd_example(int argc, char* argv[])
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>,
InLayout,
WeiLayout,
OutLayout,
ComputeDataType>(do_verification,
init_method,
time_kernel,