[CK] Integrate GPU reference into ckProfiler for convolutions (#3379)

Refactor and integrate CK GPU references into ckProfiler.
- All convolution layouts and groupings supported for all three directions
- Unit tests verifying that the GPU and CPU references produce the same results
- Support added to profiler (do_verification = 2 enables GPU reference)
- One profiler-based test per direction switched to the GPU reference to demonstrate usage

Closes AICK-427
This commit is contained in:
Johannes Graner
2025-12-18 07:59:45 +01:00
committed by GitHub
parent 87dd073887
commit bb8445dca8
31 changed files with 3351 additions and 953 deletions

View File

@@ -18,6 +18,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
namespace ck {
@@ -89,8 +90,39 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
wei_device_buf.ToDevice(wei.mData.data());
float max_accumulated_value = 0;
if(do_verification)
if(do_verification == 2)
{
// Use GPU reference for verification
std::cout << "Using GPU reference for verification" << std::endl;
// Allocate GPU reference output buffer
DeviceMem gpu_ref_in_buf(sizeof(InDataType) * in_host.mDesc.GetElementSpaceSize());
// Call GPU reference with ConvParam directly
ref::naive_conv_bwd_data<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
reinterpret_cast<InDataType*>(gpu_ref_in_buf.GetDeviceBuffer()),
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);
// Copy GPU reference result to host for comparison
gpu_ref_in_buf.FromDevice(in_host.mData.data());
max_accumulated_value = *std::max_element(in_host.mData.begin(), in_host.mData.end());
}
else if(do_verification == 1)
{
// Use CPU reference for verification (default)
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
InDataType,
WeiDataType,

View File

@@ -23,6 +23,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
namespace ck {
namespace profiler {
@@ -93,29 +94,69 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
float max_accumulated_value = 0;
if(do_verification)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weight_host_result,
output,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
out_element_op,
{},
{},
{});
if(do_verification == 1)
{
// CPU reference
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weight_host_result,
output,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
out_element_op,
{},
{},
{});
ref_invoker.Run(ref_argument);
}
else if(do_verification == 2)
{
// GPU reference
std::cout << "Running GPU reference implementation..." << std::endl;
// Allocate device memory for reference
DeviceMem in_ref_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
DeviceMem wei_ref_buf(sizeof(WeiDataType) *
weight_host_result.mDesc.GetElementSpaceSize());
DeviceMem out_ref_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize());
in_ref_buf.ToDevice(input.mData.data());
out_ref_buf.ToDevice(output.mData.data());
// Call GPU reference with ConvParam directly
ck::ref::naive_conv_bwd_weight<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
static_cast<const InDataType*>(in_ref_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_ref_buf.GetDeviceBuffer()),
static_cast<const OutDataType*>(out_ref_buf.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);
// Copy result back to host
wei_ref_buf.FromDevice(weight_host_result.mData.data());
}
ref_invoker.Run(ref_argument);
max_accumulated_value =
*std::max_element(weight_host_result.mData.begin(), weight_host_result.mData.end());
}

View File

@@ -22,6 +22,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
namespace ck {
namespace profiler {
@@ -113,8 +114,38 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
wei_device_buf.ToDevice(weight.mData.data());
// run reference op
if(do_verification)
if(do_verification == 2)
{
// Use GPU reference for verification
std::cout << "Using GPU reference for verification" << std::endl;
// Allocate GPU reference output buffer
DeviceMem gpu_ref_out_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
// Call GPU reference with ConvParam directly
ref::naive_conv_fwd<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
reinterpret_cast<OutDataType*>(gpu_ref_out_buf.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);
// Copy GPU reference result to host for comparison
gpu_ref_out_buf.FromDevice(host_output.mData.data());
}
else if(do_verification == 1)
{
// Use CPU reference for verification (default)
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,