mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-13 01:36:06 +00:00
[CK] Integrate GPU reference into ckProfiler for convolutions (#3379)
Refactor and integrate CK GPU references into ckProfiler. - All convolution layouts and groupings supported for all three directions - Unit tests verifying the GPU and CPU references are the same - Support added to profiler (do_verification = 2 enables GPU reference) - One profiler-based test per direction changed to GPU reference to demonstrate usage. Closes AICK-427
This commit is contained in:
@@ -18,6 +18,7 @@
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -89,8 +90,39 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
|
||||
float max_accumulated_value = 0;
|
||||
if(do_verification)
|
||||
if(do_verification == 2)
|
||||
{
|
||||
// Use GPU reference for verification
|
||||
std::cout << "Using GPU reference for verification" << std::endl;
|
||||
|
||||
// Allocate GPU reference output buffer
|
||||
DeviceMem gpu_ref_in_buf(sizeof(InDataType) * in_host.mDesc.GetElementSpaceSize());
|
||||
|
||||
// Call GPU reference with ConvParam directly
|
||||
ref::naive_conv_bwd_data<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
reinterpret_cast<InDataType*>(gpu_ref_in_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
|
||||
conv_param,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
// Copy GPU reference result to host for comparison
|
||||
gpu_ref_in_buf.FromDevice(in_host.mData.data());
|
||||
max_accumulated_value = *std::max_element(in_host.mData.begin(), in_host.mData.end());
|
||||
}
|
||||
else if(do_verification == 1)
|
||||
{
|
||||
// Use CPU reference for verification (default)
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace profiler {
|
||||
@@ -93,29 +94,69 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
float max_accumulated_value = 0;
|
||||
if(do_verification)
|
||||
{
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>{};
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(input,
|
||||
weight_host_result,
|
||||
output,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op,
|
||||
{},
|
||||
{},
|
||||
{});
|
||||
if(do_verification == 1)
|
||||
{
|
||||
// CPU reference
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>{};
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(input,
|
||||
weight_host_result,
|
||||
output,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op,
|
||||
{},
|
||||
{},
|
||||
{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
}
|
||||
else if(do_verification == 2)
|
||||
{
|
||||
// GPU reference
|
||||
std::cout << "Running GPU reference implementation..." << std::endl;
|
||||
|
||||
// Allocate device memory for reference
|
||||
DeviceMem in_ref_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_ref_buf(sizeof(WeiDataType) *
|
||||
weight_host_result.mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_ref_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_ref_buf.ToDevice(input.mData.data());
|
||||
out_ref_buf.ToDevice(output.mData.data());
|
||||
|
||||
// Call GPU reference with ConvParam directly
|
||||
ck::ref::naive_conv_bwd_weight<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
static_cast<const InDataType*>(in_ref_buf.GetDeviceBuffer()),
|
||||
static_cast<WeiDataType*>(wei_ref_buf.GetDeviceBuffer()),
|
||||
static_cast<const OutDataType*>(out_ref_buf.GetDeviceBuffer()),
|
||||
conv_param,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
// Copy result back to host
|
||||
wei_ref_buf.FromDevice(weight_host_result.mData.data());
|
||||
}
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
max_accumulated_value =
|
||||
*std::max_element(weight_host_result.mData.begin(), weight_host_result.mData.end());
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace profiler {
|
||||
@@ -113,8 +114,38 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
wei_device_buf.ToDevice(weight.mData.data());
|
||||
|
||||
// run reference op
|
||||
if(do_verification)
|
||||
if(do_verification == 2)
|
||||
{
|
||||
// Use GPU reference for verification
|
||||
std::cout << "Using GPU reference for verification" << std::endl;
|
||||
|
||||
// Allocate GPU reference output buffer
|
||||
DeviceMem gpu_ref_out_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
|
||||
|
||||
// Call GPU reference with ConvParam directly
|
||||
ref::naive_conv_fwd<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<OutDataType*>(gpu_ref_out_buf.GetDeviceBuffer()),
|
||||
conv_param,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
// Copy GPU reference result to host for comparison
|
||||
gpu_ref_out_buf.FromDevice(host_output.mData.data());
|
||||
}
|
||||
else if(do_verification == 1)
|
||||
{
|
||||
// Use CPU reference for verification (default)
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
|
||||
Reference in New Issue
Block a user