Merge commit 'bb8445dca8a43fe37b9dd35c04bda98d33115399' into develop

assistant-librarian[bot] committed 2025-12-18 07:15:19 +00:00
parent 334ae1c494
commit ba29aebebd
31 changed files with 3351 additions and 953 deletions

View File

@@ -131,6 +131,9 @@ template <ck::index_t NDimSpatial,
typename WeiElementOp,
typename OutElementOp,
typename DeviceConvNDFwdInstance,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename ComputeDataType = OutDataType>
bool run_grouped_conv_fwd(int do_verification,
int init_method,
@@ -283,31 +286,25 @@ bool run_grouped_conv_fwd(int do_verification,
DeviceMem out_device_ref_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
out_device_ref_buf.SetZero();
// Extract dimensions using helper function
ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
// Launch GPU reference kernel
constexpr ck::index_t block_size = 256;
const ck::long_index_t output_length = dims.N * dims.Do * dims.Ho * dims.Wo * dims.K;
const ck::index_t grid_size = (output_length + block_size - 1) / block_size;
auto gpu_ref_kernel = ck::ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
WeiDataType,
OutDataType,
ComputeDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
gpu_ref_kernel<<<dim3(grid_size), dim3(block_size), 0, nullptr>>>(
// Call GPU reference with ConvParam directly, using the correct layout types
ck::ref::naive_conv_fwd<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
reinterpret_cast<OutDataType*>(out_device_ref_buf.GetDeviceBuffer()),
dims);
conv_param);
HIP_CHECK_ERROR(hipDeviceSynchronize());
std::cout << "GPU reference kernel completed successfully, copying results..." << std::endl;
std::cout << "GPU reference function completed successfully, copying results..."
<< std::endl;
// Copy GPU reference result to host
out_device_ref_buf.FromDevice(out_host.mData.data());
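
For context, the new entry point replaces the manual ConvDims extraction, grid-size computation, and raw kernel launch that this hunk deletes. A minimal sketch of a standalone call, assuming the headers and PassThrough element ops used elsewhere in this commit (the layout and data types here are placeholders, and the element-op/stream defaults are assumed to mirror the bwd-data wrapper shown later):

    // Hedged sketch: ConvParam-based GPU reference for forward convolution.
    // naive_conv_fwd launches its own kernel internally; pointers are device
    // pointers in the caller's (possibly strided) layout.
    using Pass = ck::tensor_operation::element_wise::PassThrough;
    ck::ref::naive_conv_fwd<InLayout, WeiLayout, OutLayout,
                            InDataType, WeiDataType, OutDataType,
                            Pass, Pass, Pass>(
        p_in,        // const InDataType*  device pointer
        p_wei,       // const WeiDataType* device pointer
        p_out,       // OutDataType*       device pointer
        conv_param); // ck::utils::conv::ConvParam: G, N, K, C + spatial lengths
    HIP_CHECK_ERROR(hipDeviceSynchronize());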

View File

@@ -12,7 +12,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
{
print_helper_msg();
int do_verification = 1; // 0=no, 1=CPU, 2=GPU
int do_verification = 2; // 0=no, 1=CPU, 2=GPU
int init_method = 1;
bool time_kernel = false;
@@ -71,6 +71,9 @@ bool run_convnd_fwd_example(int argc, char* argv[])
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>,
InLayout,
WeiLayout,
OutLayout,
ComputeDataType>(do_verification,
init_method,
time_kernel,

View File

@@ -18,7 +18,8 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/host_utility/hip_check_error.hpp"
using ::ck::DeviceMem;
using ::ck::HostTensorDescriptor;
@@ -81,7 +82,10 @@ template <ck::index_t NDimSpatial,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp,
typename DeviceConvNdBwdDataInstance>
typename DeviceConvNdBwdDataInstance,
typename InLayout,
typename WeiLayout,
typename OutLayout>
int run_conv_bwd_data(int do_verification,
int init_method,
bool time_kernel,
@@ -225,50 +229,52 @@ int run_conv_bwd_data(int do_verification,
}
else if(do_verification == 2)
{
// GPU verification
// GPU verification using naive GPU reference
std::cout << "Running GPU verification..." << std::endl;
// Allocate and ZERO GPU memory for reference input
DeviceMem in_device_ref_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
in_device_ref_buf.SetZero();
// Extract dimensions using helper function
ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
constexpr ck::index_t block_size = 256;
const ck::long_index_t input_length = dims.N * dims.Di * dims.Hi * dims.Wi * dims.C;
const ck::index_t grid_size = (input_length + block_size - 1) / block_size;
auto gpu_ref_kernel = ck::ref::naive_conv_bwd_data_ndhwc_kzyxc_ndhwk<InDataType,
WeiDataType,
OutDataType,
float,
InElementOp,
WeiElementOp,
OutElementOp>;
gpu_ref_kernel<<<dim3(grid_size), dim3(block_size), 0, nullptr>>>(
// Call GPU reference with ConvParam directly, using the correct layout types
ck::ref::naive_conv_bwd_data<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
reinterpret_cast<InDataType*>(in_device_ref_buf.GetDeviceBuffer()),
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
dims);
conv_param,
in_element_op,
wei_element_op,
out_element_op);
HIP_CHECK_ERROR(hipDeviceSynchronize());
std::cout << "GPU reference kernel completed, copying results..." << std::endl;
std::cout << "GPU reference function completed successfully, copying results..."
<< std::endl;
// Copy GPU reference result
// Copy GPU reference result to host
Tensor<InDataType> in_gpu_ref(in_host.mDesc);
in_device_ref_buf.FromDevice(in_gpu_ref.mData.data());
// Copy optimized kernel result
// Copy GPU kernel result to host
in_device_buf.FromDevice(in_device.mData.data());
std::cout << "Comparing GPU kernel output vs GPU reference..." << std::endl;
// Compare: Optimized kernel result vs GPU reference result
bool pass = ck::utils::check_err(in_device,
in_gpu_ref,
"Error: Incorrect results!",
get_rtol<InDataType, float>(),
get_atol<InDataType, float>());
std::cout << "GPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
return pass ? 0 : 1;

View File

@@ -92,16 +92,19 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNdBwdDataInstance<1>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
DeviceConvNdBwdDataInstance<1>,
InLayout,
WeiLayout,
OutLayout>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
@@ -128,16 +131,19 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNdBwdDataInstance<2>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
DeviceConvNdBwdDataInstance<2>,
InLayout,
WeiLayout,
OutLayout>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
@@ -164,16 +170,19 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNdBwdDataInstance<3>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
DeviceConvNdBwdDataInstance<3>,
InLayout,
WeiLayout,
OutLayout>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;

View File

@@ -119,16 +119,19 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNdBwdDataInstance<1>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
DeviceConvNdBwdDataInstance<1>,
InLayout,
WeiLayout,
OutLayout>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
@@ -155,16 +158,19 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNdBwdDataInstance<2>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
DeviceConvNdBwdDataInstance<2>,
InLayout,
WeiLayout,
OutLayout>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
@@ -191,16 +197,19 @@ int main(int argc, char* argv[])
InElementOp,
WeiElementOp,
OutElementOp,
DeviceConvNdBwdDataInstance<3>>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
DeviceConvNdBwdDataInstance<3>,
InLayout,
WeiLayout,
OutLayout>(do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;

View File

@@ -149,55 +149,53 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
}
else if(config.do_verification == 2)
{
// GPU verification (only supports G=1, standard convolution)
if(conv_param.G_ != 1)
{
std::cout << "GPU verification only supports G=1 (standard convolution)" << std::endl;
std::cout << "Current G=" << conv_param.G_ << " not supported." << std::endl;
std::cout << "Use do_verification=1 for CPU verification with grouped convolution."
<< std::endl;
return true;
}
std::cout << "Running GPU verification (G=1)..." << std::endl;
// GPU verification using naive GPU reference
std::cout << "Running GPU verification..." << std::endl;
// Allocate and ZERO GPU memory for reference weights
DeviceMem wei_device_ref_buf(sizeof(WeiDataType) *
wei_device_result.mDesc.GetElementSpaceSize());
wei_device_ref_buf.SetZero();
// Extract dimensions using helper function (G=1, standard convolution)
ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial, false);
// Call GPU reference function with ConvParam and layout types
using InLayout = InputLayout<NDimSpatial>;
using WeiLayout = WeightLayout<NDimSpatial>;
using OutLayout = OutputLayout<NDimSpatial>;
constexpr ck::index_t block_size = 256;
const ck::long_index_t weight_length = dims.K * dims.Z * dims.Y * dims.X * dims.C;
const ck::index_t grid_size = (weight_length + block_size - 1) / block_size;
auto gpu_ref_kernel = ck::ref::naive_conv_bwd_weight_ndhwc_kzyxc_ndhwk<InDataType,
WeiDataType,
OutDataType,
float,
InElementOp,
WeiElementOp,
OutElementOp>;
gpu_ref_kernel<<<dim3(grid_size), dim3(block_size), 0, nullptr>>>(
ck::ref::naive_conv_bwd_weight<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
reinterpret_cast<WeiDataType*>(wei_device_ref_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
dims);
conv_param);
HIP_CHECK_ERROR(hipDeviceSynchronize());
std::cout << "GPU reference kernel completed, copying results..." << std::endl;
std::cout << "GPU reference function completed successfully, copying results..."
<< std::endl;
// Copy GPU reference result to host
wei_device_ref_buf.FromDevice(wei_host_result.mData.data());
// Copy GPU kernel result to host
wei_device_buf.FromDevice(wei_device_result.mData.data());
std::cout << "Comparing GPU kernel output vs GPU reference..." << std::endl;
// Compare: Optimized kernel result vs GPU reference result
bool pass = ck::utils::check_err(wei_device_result.mData,
wei_host_result.mData,
"Error: Incorrect results!",
get_rtol<WeiDataType, float>(),
get_atol<WeiDataType, float>());
std::cout << "GPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
return pass;
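
For reference, the quantity the naive reference computes for each weight-gradient element (2D case shown; g is the group index, which this hunk stops restricting to G == 1):

    dW[g,k,c,y,x] = sum over n, ho, wo of
                    in[g,n,c, ho*stride_y + y*dilation_y - pad_y,
                              wo*stride_x + x*dilation_x - pad_x]
                    * dout[g,n,k,ho,wo]

with a term skipped whenever the input coordinate falls outside [0, Hi) x [0, Wi). This matches the loop nest of naive_conv_bwd_weight_packed later in this commit.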

View File

@@ -1,353 +0,0 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// Standalone test program for Old CK GPU references
// Tests naive_conv_fwd (existing) and future backward ops
#include <iostream>
#include <vector>
#include <numeric>
#include <algorithm>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
// CPU reference for validation
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
// GPU reference (OLD CK - already exists!)
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
using namespace ck;
template <index_t NDimSpatial>
struct ConvParams
{
index_t N, K, C;
std::vector<index_t> input_spatial;
std::vector<index_t> filter_spatial;
std::vector<index_t> output_spatial;
std::vector<index_t> strides;
std::vector<index_t> dilations;
std::vector<index_t> pads;
};
template <index_t NDimSpatial, typename InDataType, typename WeiDataType, typename OutDataType>
bool test_conv_forward_gpu_ref(const ConvParams<NDimSpatial>& params, const std::string& test_name)
{
std::cout << "[TEST] " << test_name << std::endl;
// Calculate dimensions
const index_t N = params.N;
const index_t K = params.K;
const index_t C = params.C;
// Create tensor descriptors (NDHWC layout for old CK)
std::vector<index_t> in_lengths = {N};
for(auto d : params.input_spatial)
in_lengths.push_back(d);
in_lengths.push_back(C);
std::vector<index_t> wei_lengths = {K};
for(auto d : params.filter_spatial)
wei_lengths.push_back(d);
wei_lengths.push_back(C);
std::vector<index_t> out_lengths = {N};
for(auto d : params.output_spatial)
out_lengths.push_back(d);
out_lengths.push_back(K);
// Create host tensors
Tensor<InDataType> input(in_lengths);
Tensor<WeiDataType> weight(wei_lengths);
Tensor<OutDataType> output_gpu(out_lengths);
Tensor<OutDataType> output_ref(out_lengths);
// Initialize with random data
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
// Allocate device memory
DeviceMem input_dev(input.mData.size() * sizeof(InDataType));
DeviceMem weight_dev(weight.mData.size() * sizeof(WeiDataType));
DeviceMem output_dev(output_gpu.mData.size() * sizeof(OutDataType));
// Copy to device
input_dev.ToDevice(input.mData.data());
weight_dev.ToDevice(weight.mData.data());
// Run CPU reference for validation
auto ref_conv =
tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
tensor_operation::element_wise::PassThrough,
tensor_operation::element_wise::PassThrough,
tensor_operation::element_wise::PassThrough>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_arg = ref_conv.MakeArgument(input.mData.data(),
weight.mData.data(),
output_ref.mData.data(),
N,
K,
C,
params.input_spatial,
params.filter_spatial,
params.output_spatial,
params.strides,
params.dilations,
params.pads,
params.pads,
{},
{},
{});
ref_invoker.Run(ref_arg);
// Run GPU reference (OLD CK)
using InElementOp = tensor_operation::element_wise::PassThrough;
using WeiElementOp = tensor_operation::element_wise::PassThrough;
using OutElementOp = tensor_operation::element_wise::PassThrough;
constexpr index_t block_size = 256;
// Extract dimensions based on NDimSpatial
index_t Di = 1, Hi = 1, Wi = 1;
index_t Z = 1, Y = 1, X = 1;
index_t Do = 1, Ho = 1, Wo = 1;
index_t stride_z = 1, stride_y = 1, stride_x = 1;
index_t dilation_z = 1, dilation_y = 1, dilation_x = 1;
index_t pad_z = 0, pad_y = 0, pad_x = 0;
if(NDimSpatial == 1)
{
Wi = params.input_spatial[0];
X = params.filter_spatial[0];
Wo = params.output_spatial[0];
stride_x = params.strides[0];
dilation_x = params.dilations[0];
pad_x = params.pads[0];
}
else if(NDimSpatial == 2)
{
Hi = params.input_spatial[0];
Wi = params.input_spatial[1];
Y = params.filter_spatial[0];
X = params.filter_spatial[1];
Ho = params.output_spatial[0];
Wo = params.output_spatial[1];
stride_y = params.strides[0];
stride_x = params.strides[1];
dilation_y = params.dilations[0];
dilation_x = params.dilations[1];
pad_y = params.pads[0];
pad_x = params.pads[1];
}
else if(NDimSpatial == 3)
{
Di = params.input_spatial[0];
Hi = params.input_spatial[1];
Wi = params.input_spatial[2];
Z = params.filter_spatial[0];
Y = params.filter_spatial[1];
X = params.filter_spatial[2];
Do = params.output_spatial[0];
Ho = params.output_spatial[1];
Wo = params.output_spatial[2];
stride_z = params.strides[0];
stride_y = params.strides[1];
stride_x = params.strides[2];
dilation_z = params.dilations[0];
dilation_y = params.dilations[1];
dilation_x = params.dilations[2];
pad_z = params.pads[0];
pad_y = params.pads[1];
pad_x = params.pads[2];
}
// Launch GPU reference kernel
const long_index_t output_length = N * Do * Ho * Wo * K;
const index_t grid_size = (output_length + block_size - 1) / block_size;
hipLaunchKernelGGL(ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
WeiDataType,
OutDataType,
float,
InElementOp,
WeiElementOp,
OutElementOp>,
dim3(grid_size),
dim3(block_size),
0,
nullptr,
reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
reinterpret_cast<OutDataType*>(output_dev.GetDeviceBuffer()),
N,
K,
C,
Di,
Hi,
Wi,
Z,
Y,
X,
Do,
Ho,
Wo,
stride_z,
stride_y,
stride_x,
dilation_z,
dilation_y,
dilation_x,
pad_z,
pad_y,
pad_x);
hipDeviceSynchronize();
// Copy result back
output_dev.FromDevice(output_gpu.mData.data());
// Compare GPU ref vs CPU ref
bool pass = check_err(output_gpu.mData, output_ref.mData, "GPU vs CPU ref", 1e-3, 1e-3);
std::cout << " Result: " << (pass ? "✅ PASS" : "❌ FAIL") << std::endl;
return pass;
}
int main(int argc, char* argv[])
{
std::cout << "========================================" << std::endl;
std::cout << "Old CK GPU Reference Test Program" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << std::endl;
int passed = 0;
int failed = 0;
// Test 1: 2D Conv, FP16, Small
{
ConvParams<2> params;
params.N = 2;
params.K = 8;
params.C = 8;
params.input_spatial = {7, 7};
params.filter_spatial = {3, 3};
params.output_spatial = {5, 5};
params.strides = {1, 1};
params.dilations = {1, 1};
params.pads = {0, 0};
if(test_conv_forward_gpu_ref<2, half_t, half_t, half_t>(params, "2D-FP16-Small"))
passed++;
else
failed++;
}
// Test 2: 2D Conv, FP32, Medium
{
ConvParams<2> params;
params.N = 4;
params.K = 16;
params.C = 16;
params.input_spatial = {14, 14};
params.filter_spatial = {3, 3};
params.output_spatial = {12, 12};
params.strides = {1, 1};
params.dilations = {1, 1};
params.pads = {0, 0};
if(test_conv_forward_gpu_ref<2, float, float, float>(params, "2D-FP32-Medium"))
passed++;
else
failed++;
}
// Test 3: 1D Conv, FP16
{
ConvParams<1> params;
params.N = 2;
params.K = 8;
params.C = 8;
params.input_spatial = {16};
params.filter_spatial = {3};
params.output_spatial = {14};
params.strides = {1};
params.dilations = {1};
params.pads = {0};
if(test_conv_forward_gpu_ref<1, half_t, half_t, half_t>(params, "1D-FP16"))
passed++;
else
failed++;
}
// Test 4: 3D Conv, FP16, Small
{
ConvParams<3> params;
params.N = 1;
params.K = 8;
params.C = 8;
params.input_spatial = {5, 5, 5};
params.filter_spatial = {3, 3, 3};
params.output_spatial = {3, 3, 3};
params.strides = {1, 1, 1};
params.dilations = {1, 1, 1};
params.pads = {0, 0, 0};
if(test_conv_forward_gpu_ref<3, half_t, half_t, half_t>(params, "3D-FP16-Small"))
passed++;
else
failed++;
}
// Test 5: 2D Conv with stride
{
ConvParams<2> params;
params.N = 2;
params.K = 8;
params.C = 8;
params.input_spatial = {8, 8};
params.filter_spatial = {3, 3};
params.output_spatial = {3, 3};
params.strides = {2, 2};
params.dilations = {1, 1};
params.pads = {0, 0};
if(test_conv_forward_gpu_ref<2, half_t, half_t, half_t>(params, "2D-FP16-Stride2"))
passed++;
else
failed++;
}
std::cout << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "SUMMARY" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "Total: " << (passed + failed) << std::endl;
std::cout << "Passed: " << passed << "" << std::endl;
std::cout << "Failed: " << failed << std::endl;
std::cout << std::endl;
if(failed == 0)
{
std::cout << "🎉 ALL TESTS PASSED!" << std::endl;
std::cout << "Old CK Forward GPU Reference: WORKING ✅" << std::endl;
return 0;
}
else
{
std::cout << "❌ SOME TESTS FAILED" << std::endl;
return 1;
}
}

View File

@@ -7,11 +7,12 @@
#include <iostream>
#include <memory>
#include <sstream>
#include "conv_util.hpp"
#include "device.hpp"
#include "device_conv_fwd.hpp"
#include "common_header.hpp"
#include "naive_conv_fwd_gpu.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/stream_config.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
namespace ck {
namespace tensor_operation {
@@ -26,7 +27,16 @@ template <typename InDataType,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K
: public DeviceConvFwd<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>
: public DeviceConvFwd<3,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK,
InDataType,
WeiDataType,
OutDataType,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
{
using DeviceOp = DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K;
@@ -57,6 +67,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op)
: params_{3,
1, // G (group count, always 1 for non-grouped)
N,
K,
C,
@@ -78,7 +89,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
}
// private:
utils::conv::ConvParams params_;
utils::conv::ConvParam params_;
std::vector<index_t> out_spatial_lengths_;
const InDataType* p_in_;
@@ -97,46 +108,28 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto naive_conv3d_fwd =
ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
WeiDataType,
OutDataType,
AccDataType,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>;
using InLayout = ck::tensor_layout::convolution::GNCDHW;
using WeiLayout = ck::tensor_layout::convolution::GKCZYX;
using OutLayout = ck::tensor_layout::convolution::GNKDHW;
float ave_time = launch_and_time_kernel(stream_config,
naive_conv3d_fwd,
dim3(256),
dim3(256),
0,
arg.p_in_,
arg.p_wei_,
arg.p_out_,
arg.N_,
arg.K_,
arg.C_,
arg.in_spatial_lengths_[0],
arg.in_spatial_lengths_[1],
arg.in_spatial_lengths_[2],
arg.filter_spatial_lengths_[0],
arg.filter_spatial_lengths_[1],
arg.filter_spatial_lengths_[2],
arg.out_spatial_lengths_[0],
arg.out_spatial_lengths_[1],
arg.out_spatial_lengths_[2],
arg.conv_filter_strides_[0],
arg.conv_filter_strides_[1],
arg.conv_filter_strides_[2],
arg.conv_filter_dilations_[0],
arg.conv_filter_dilations_[1],
arg.conv_filter_dilations_[2],
arg.in_left_pads_[0],
arg.in_left_pads_[1],
arg.in_left_pads_[2]);
return ave_time;
// Use simplified ConvParam-based API
ref::naive_conv_fwd<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>(arg.p_in_,
arg.p_wei_,
arg.p_out_,
arg.params_,
arg.in_element_op_,
arg.wei_element_op_,
arg.out_element_op_,
stream_config.stream_id_);
return 0; // No timing for naive implementation
}
// polymorphic
@@ -155,7 +148,9 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
static bool IsSupportedArgument(const Argument& arg)
{
std::vector<index_t> out_spatial_lengths = arg.params_.GetOutputSpatialLengths();
auto out_spatial_lengths_long = arg.params_.GetOutputSpatialLengths();
std::vector<index_t> out_spatial_lengths(out_spatial_lengths_long.begin(),
out_spatial_lengths_long.end());
bool out_lengths_are_consistent = out_spatial_lengths[0] == arg.out_spatial_lengths_[0] &&
out_spatial_lengths[1] == arg.out_spatial_lengths_[1] &&
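
The Argument initializer above now threads an explicit group count through as the new second field. A hedged sketch of the equivalent standalone construction, with the field order inferred from the params_{3, 1, N, K, C, ...} initializer in this diff (right pads are assumed symmetric with left pads here):

    // Hedged sketch: ConvParam for a 3D non-grouped convolution.
    ck::utils::conv::ConvParam params{
        3,            // NDimSpatial
        1,            // G: group count (1 = non-grouped)
        N, K, C,      // batch, output channels, input channels
        {Z, Y, X},    // filter spatial lengths
        {Di, Hi, Wi}, // input spatial lengths
        {stride_z, stride_y, stride_x},
        {dilation_z, dilation_y, dilation_x},
        {pad_z, pad_y, pad_x},   // input left pads
        {pad_z, pad_y, pad_x}};  // input right pads (assumed symmetric)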

View File

@@ -1,73 +0,0 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#ifndef CONV_COMMON_HPP
#define CONV_COMMON_HPP
#include "ck/ck.hpp"
namespace ck {
namespace ref {
// Device-compatible dimension structure for GPU reference kernels
// Replaces passing 24 individual parameters
struct ConvDims
{
index_t N, K, C;
index_t Di, Hi, Wi;
index_t Z, Y, X;
index_t Do, Ho, Wo;
index_t stride_z, stride_y, stride_x;
index_t dilation_z, dilation_y, dilation_x;
index_t pad_z, pad_y, pad_x;
};
} // namespace ref
// Helper function to extract dimensions from ConvParam for GPU kernels
// Defined in ck::utils::conv namespace for convenience
namespace utils {
namespace conv {
inline ck::ref::ConvDims
extract_conv_dims(const ConvParam& conv_param, ck::index_t NDimSpatial, bool apply_group = true)
{
ck::ref::ConvDims dims;
dims.N = conv_param.N_;
dims.K = conv_param.K_;
dims.C = apply_group ? (conv_param.C_ * conv_param.G_) : conv_param.C_;
dims.Di = (NDimSpatial >= 3) ? conv_param.input_spatial_lengths_[0] : 1;
dims.Hi = (NDimSpatial >= 2) ? conv_param.input_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
dims.Wi = conv_param.input_spatial_lengths_[NDimSpatial - 1];
dims.Z = (NDimSpatial >= 3) ? conv_param.filter_spatial_lengths_[0] : 1;
dims.Y = (NDimSpatial >= 2) ? conv_param.filter_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
dims.X = conv_param.filter_spatial_lengths_[NDimSpatial - 1];
dims.Do = (NDimSpatial >= 3) ? conv_param.output_spatial_lengths_[0] : 1;
dims.Ho = (NDimSpatial >= 2) ? conv_param.output_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
dims.Wo = conv_param.output_spatial_lengths_[NDimSpatial - 1];
dims.stride_z = (NDimSpatial >= 3) ? conv_param.conv_filter_strides_[0] : 1;
dims.stride_y =
(NDimSpatial >= 2) ? conv_param.conv_filter_strides_[NDimSpatial >= 3 ? 1 : 0] : 1;
dims.stride_x = conv_param.conv_filter_strides_[NDimSpatial - 1];
dims.dilation_z = (NDimSpatial >= 3) ? conv_param.conv_filter_dilations_[0] : 1;
dims.dilation_y =
(NDimSpatial >= 2) ? conv_param.conv_filter_dilations_[NDimSpatial >= 3 ? 1 : 0] : 1;
dims.dilation_x = conv_param.conv_filter_dilations_[NDimSpatial - 1];
dims.pad_z = (NDimSpatial >= 3) ? conv_param.input_left_pads_[0] : 0;
dims.pad_y = (NDimSpatial >= 2) ? conv_param.input_left_pads_[NDimSpatial >= 3 ? 1 : 0] : 0;
dims.pad_x = conv_param.input_left_pads_[NDimSpatial - 1];
return dims;
}
} // namespace conv
} // namespace utils
} // namespace ck
#endif
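
Reading the removed helper: the trailing spatial entry always maps to W, so with NDimSpatial == 2 it produced Hi = input_spatial_lengths_[0], Wi = input_spatial_lengths_[1] and Di = 1, and with NDimSpatial == 1 only Wi was meaningful. The apply_group flag folded groups into the channel count (C * G), which is why the pre-commit GPU verification paths above had to restrict themselves to G == 1; the ConvParam-based references that replace this helper carry G explicitly instead.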

View File

@@ -4,146 +4,515 @@
#pragma once
#include "ck/utility/type_convert.hpp"
#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
namespace ref {
/*
* \brief naive implementation of 3D convolution backward data.
* Layout is (NDHWC, KZYXC, NDHWK).
* Computes gradient with respect to input.
*
* \param N number of batches
* \param K number of filters (output channels)
* \param C number of input channels
* \param (Di, Hi, Wi) depth, height and width dimension of input
* \param (Z, Y, X) depth, height and width dimensions of filter
* \param (Do, Ho, Wo) depth, height and width dimension of output
* \param (stride_z, stride_y, stride_x) strides
* \param (dilation_z, dilation_y, dilation_x) dilations
* \param (pad_z, pad_y, pad_x) pads
*/
template <typename TIn,
typename TWei,
typename TOut,
typename TAcc,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
__global__ void naive_conv_bwd_data_ndhwc_kzyxc_ndhwk(TIn* __restrict__ p_in_grad,
const TWei* __restrict__ p_wei,
const TOut* __restrict__ p_out_grad,
const ConvDims dims)
// Optimized backward data convolution kernel working with packed (contiguous) tensors
// Computes gradients w.r.t. input from output gradients and weights
// Assumes row-major packing: input[G][N][C][spatial], weight[G][K][C][filter],
// output[G][N][K][spatial]
template <index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp>
__global__ void naive_conv_bwd_data_packed(InDataType* __restrict__ p_in,
const WeiDataType* __restrict__ p_wei,
const OutDataType* __restrict__ p_out,
index_t G,
index_t N,
index_t K,
index_t C,
index_t Di,
index_t Hi,
index_t Wi,
index_t Z,
index_t Y,
index_t X,
index_t Do,
index_t Ho,
index_t Wo,
index_t stride_z,
index_t stride_y,
index_t stride_x,
index_t dilation_z,
index_t dilation_y,
index_t dilation_x,
index_t pad_z,
index_t pad_y,
index_t pad_x,
InElementOp in_op,
WeiElementOp wei_op,
OutElementOp out_op)
{
const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const index_t num_threads = blockDim.x * gridDim.x;
const long_index_t input_length = dims.N * dims.Di * dims.Hi * dims.Wi * dims.C;
const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const long_index_t num_threads = blockDim.x * gridDim.x;
const index_t in_strides[] = {
dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C};
const index_t out_strides[] = {
dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K};
const index_t wei_strides[] = {
dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C};
InDataType in_val = InDataType{0};
WeiDataType wei_val = WeiDataType{0};
OutDataType out_val = OutDataType{0};
constexpr auto in_op = InElementwiseOperation{};
constexpr auto wei_op = WeiElementwiseOperation{};
constexpr auto out_op = OutElementwiseOperation{};
TIn in_val = TIn{0};
TWei wei_val = TWei{0};
TOut out_val = TOut{0};
for(long_index_t ii = tid; ii < input_length; ii += num_threads)
if constexpr(NDimSpatial == 1)
{
// Decode linear index to (n, di, hi, wi, c)
const index_t n = ii / in_strides[0];
index_t tmp = ii - n * in_strides[0];
const index_t di = tmp / in_strides[1];
tmp -= di * in_strides[1];
const index_t hi = tmp / in_strides[2];
tmp -= hi * in_strides[2];
const index_t wi = tmp / in_strides[3];
tmp -= wi * in_strides[3];
const index_t c = tmp;
const long_index_t num_in = G * N * C * Wi;
const long_index_t out_stride_g = N * K * Wo;
const long_index_t out_stride_n = K * Wo;
const long_index_t out_stride_k = Wo;
const long_index_t wei_stride_g = K * C * X;
const long_index_t wei_stride_k = C * X;
const long_index_t wei_stride_c = X;
const long_index_t in_stride_g = N * C * Wi;
const long_index_t in_stride_n = C * Wi;
const long_index_t in_stride_c = Wi;
// Always accumulate in float
float acc_float = 0.0f;
const TOut* out_n = p_out_grad + static_cast<long_index_t>(n) * out_strides[0];
// Loop over output channels
for(index_t k = 0; k < dims.K; ++k)
for(long_index_t idx = tid; idx < num_in; idx += num_threads)
{
const TWei* wei_k = p_wei + static_cast<long_index_t>(k) * wei_strides[0];
index_t remaining = idx;
const index_t wi = remaining % Wi;
remaining /= Wi;
const index_t c = remaining % C;
remaining /= C;
const index_t n = remaining % N;
const index_t g = remaining / N;
// Loop over filter dimensions
for(index_t z = 0; z < dims.Z; ++z)
float acc = 0.0f;
const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
const WeiDataType* wei_g = p_wei + g * wei_stride_g;
for(index_t x = 0; x < X; ++x)
{
// Calculate output position from input position (inverse of forward)
index_t d_tmp = di + dims.pad_z - z * dims.dilation_z;
if(d_tmp % dims.stride_z != 0)
continue;
index_t d_o = d_tmp / dims.stride_z;
if(d_o < 0 || d_o >= dims.Do)
continue;
const TOut* out_n_do = out_n + d_o * out_strides[1];
const TWei* wei_k_z = wei_k + z * wei_strides[1];
for(index_t y = 0; y < dims.Y; ++y)
long_index_t w_tmp = wi + pad_x - x * dilation_x;
if(w_tmp % stride_x == 0)
{
index_t h_tmp = hi + dims.pad_y - y * dims.dilation_y;
if(h_tmp % dims.stride_y != 0)
continue;
index_t ho = h_tmp / dims.stride_y;
if(ho < 0 || ho >= dims.Ho)
continue;
const TOut* out_n_do_ho = out_n_do + ho * out_strides[2];
const TWei* wei_k_z_y = wei_k_z + y * wei_strides[2];
for(index_t x = 0; x < dims.X; ++x)
long_index_t wo = w_tmp / stride_x;
if(wo >= 0 && wo < Wo)
{
index_t w_tmp = wi + dims.pad_x - x * dims.dilation_x;
if(w_tmp % dims.stride_x != 0)
continue;
index_t wo = w_tmp / dims.stride_x;
if(wo < 0 || wo >= dims.Wo)
continue;
const OutDataType* out_gnk = out_gn;
const WeiDataType* wei_gkc = wei_g + c * wei_stride_c;
const TOut* out_n_do_ho_wo = out_n_do_ho + wo * out_strides[3];
const TWei* wei_k_z_y_x = wei_k_z_y + x * wei_strides[3];
// Load values from memory
TOut out_loaded = out_n_do_ho_wo[k];
TWei wei_loaded = wei_k_z_y_x[c];
// Apply element-wise operations (like forward does)
out_op(out_val, out_loaded);
wei_op(wei_val, wei_loaded);
// Convert to float for multiplication
float out_f = type_convert<float>(out_val);
float wei_f = type_convert<float>(wei_val);
acc_float += out_f * wei_f;
for(index_t k = 0; k < K; ++k)
{
out_op(out_val, out_gnk[k * out_stride_k + wo]);
wei_op(wei_val, wei_gkc[k * wei_stride_k + x]);
acc += type_convert<float>(out_val) * type_convert<float>(wei_val);
}
}
}
}
InDataType result = type_convert<InDataType>(acc);
in_op(in_val, result);
p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + wi] = in_val;
}
}
else if constexpr(NDimSpatial == 2)
{
const long_index_t num_in = G * N * C * Hi * Wi;
const long_index_t out_stride_g = N * K * Ho * Wo;
const long_index_t out_stride_n = K * Ho * Wo;
const long_index_t out_stride_k = Ho * Wo;
const long_index_t out_stride_h = Wo;
const long_index_t wei_stride_g = K * C * Y * X;
const long_index_t wei_stride_k = C * Y * X;
const long_index_t wei_stride_c = Y * X;
const long_index_t wei_stride_y = X;
const long_index_t in_stride_g = N * C * Hi * Wi;
const long_index_t in_stride_n = C * Hi * Wi;
const long_index_t in_stride_c = Hi * Wi;
const long_index_t in_stride_h = Wi;
// Convert float accumulator to TAcc, then to input type
TAcc acc = type_convert<TAcc>(acc_float);
TIn result = type_convert<TIn>(acc);
for(long_index_t idx = tid; idx < num_in; idx += num_threads)
{
index_t remaining = idx;
const index_t wi = remaining % Wi;
remaining /= Wi;
const index_t hi = remaining % Hi;
remaining /= Hi;
const index_t c = remaining % C;
remaining /= C;
const index_t n = remaining % N;
const index_t g = remaining / N;
// Apply input element-wise operation (if any)
in_op(in_val, result);
float acc = 0.0f;
const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
const WeiDataType* wei_g = p_wei + g * wei_stride_g;
// Write transformed result
p_in_grad[ii] = in_val;
for(index_t y = 0; y < Y; ++y)
{
long_index_t h_tmp = hi + pad_y - y * dilation_y;
if(h_tmp % stride_y == 0)
{
long_index_t ho = h_tmp / stride_y;
if(ho >= 0 && ho < Ho)
{
const OutDataType* out_gnkh = out_gn + ho * out_stride_h;
const WeiDataType* wei_gkcy = wei_g + c * wei_stride_c + y * wei_stride_y;
for(index_t x = 0; x < X; ++x)
{
long_index_t w_tmp = wi + pad_x - x * dilation_x;
if(w_tmp % stride_x == 0)
{
long_index_t wo = w_tmp / stride_x;
if(wo >= 0 && wo < Wo)
{
for(index_t k = 0; k < K; ++k)
{
out_op(out_val, out_gnkh[k * out_stride_k + wo]);
wei_op(wei_val, wei_gkcy[k * wei_stride_k + x]);
acc += type_convert<float>(out_val) *
type_convert<float>(wei_val);
}
}
}
}
}
}
}
InDataType result = type_convert<InDataType>(acc);
in_op(in_val, result);
p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + hi * in_stride_h + wi] =
in_val;
}
}
else if constexpr(NDimSpatial == 3)
{
const long_index_t num_in = G * N * C * Di * Hi * Wi;
const long_index_t out_stride_g = N * K * Do * Ho * Wo;
const long_index_t out_stride_n = K * Do * Ho * Wo;
const long_index_t out_stride_k = Do * Ho * Wo;
const long_index_t out_stride_d = Ho * Wo;
const long_index_t out_stride_h = Wo;
const long_index_t wei_stride_g = K * C * Z * Y * X;
const long_index_t wei_stride_k = C * Z * Y * X;
const long_index_t wei_stride_c = Z * Y * X;
const long_index_t wei_stride_z = Y * X;
const long_index_t wei_stride_y = X;
const long_index_t in_stride_g = N * C * Di * Hi * Wi;
const long_index_t in_stride_n = C * Di * Hi * Wi;
const long_index_t in_stride_c = Di * Hi * Wi;
const long_index_t in_stride_d = Hi * Wi;
const long_index_t in_stride_h = Wi;
for(long_index_t idx = tid; idx < num_in; idx += num_threads)
{
index_t remaining = idx;
const index_t wi = remaining % Wi;
remaining /= Wi;
const index_t hi = remaining % Hi;
remaining /= Hi;
const index_t di = remaining % Di;
remaining /= Di;
const index_t c = remaining % C;
remaining /= C;
const index_t n = remaining % N;
const index_t g = remaining / N;
float acc = 0.0f;
const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
const WeiDataType* wei_g = p_wei + g * wei_stride_g;
for(index_t z = 0; z < Z; ++z)
{
long_index_t d_tmp = di + pad_z - z * dilation_z;
if(d_tmp % stride_z == 0)
{
long_index_t do_idx = d_tmp / stride_z;
if(do_idx >= 0 && do_idx < Do)
{
const OutDataType* out_gnkd = out_gn + do_idx * out_stride_d;
const WeiDataType* wei_gkcz = wei_g + c * wei_stride_c + z * wei_stride_z;
for(index_t y = 0; y < Y; ++y)
{
long_index_t h_tmp = hi + pad_y - y * dilation_y;
if(h_tmp % stride_y == 0)
{
long_index_t ho = h_tmp / stride_y;
if(ho >= 0 && ho < Ho)
{
const OutDataType* out_gnkdh = out_gnkd + ho * out_stride_h;
const WeiDataType* wei_gkczy = wei_gkcz + y * wei_stride_y;
for(index_t x = 0; x < X; ++x)
{
long_index_t w_tmp = wi + pad_x - x * dilation_x;
if(w_tmp % stride_x == 0)
{
long_index_t wo = w_tmp / stride_x;
if(wo >= 0 && wo < Wo)
{
for(index_t k = 0; k < K; ++k)
{
out_op(out_val,
out_gnkdh[k * out_stride_k + wo]);
wei_op(wei_val,
wei_gkczy[k * wei_stride_k + x]);
acc += type_convert<float>(out_val) *
type_convert<float>(wei_val);
}
}
}
}
}
}
}
}
}
}
InDataType result = type_convert<InDataType>(acc);
in_op(in_val, result);
p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + di * in_stride_d +
hi * in_stride_h + wi] = in_val;
}
}
}
// GPU reference backward data convolution - takes ConvParam directly
template <typename InLayout,
typename WeiLayout,
typename OutLayout,
typename TIn,
typename TWei,
typename TOut,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
void naive_conv_bwd_data(TIn* p_in,
const TWei* p_wei,
const TOut* p_out,
const ck::utils::conv::ConvParam& conv_param,
InElementwiseOperation in_element_op = InElementwiseOperation{},
WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{},
OutElementwiseOperation out_element_op = OutElementwiseOperation{},
hipStream_t stream = nullptr)
{
const auto ndim = conv_param.num_dim_spatial_;
const index_t G = conv_param.G_;
const index_t N = conv_param.N_;
const index_t C = conv_param.C_;
const index_t K = conv_param.K_;
std::vector<index_t> in_lengths = {G, N, C};
std::vector<index_t> wei_lengths = {G, K, C};
std::vector<index_t> out_lengths = {G, N, K};
for(index_t i = 0; i < ndim; ++i)
{
in_lengths.push_back(static_cast<index_t>(conv_param.input_spatial_lengths_[i]));
wei_lengths.push_back(static_cast<index_t>(conv_param.filter_spatial_lengths_[i]));
out_lengths.push_back(static_cast<index_t>(conv_param.output_spatial_lengths_[i]));
}
// Calculate total elements for buffer allocation
long_index_t in_total = 1, wei_total = 1, out_total = 1;
for(auto l : in_lengths)
in_total *= l;
for(auto l : wei_lengths)
wei_total *= l;
for(auto l : out_lengths)
out_total *= l;
// Allocate packed buffers
SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn));
SimpleDeviceMem wei_packed_buf(wei_total * sizeof(TWei));
SimpleDeviceMem out_packed_buf(out_total * sizeof(TOut));
TIn* p_in_packed = static_cast<TIn*>(in_packed_buf.GetDeviceBuffer());
TWei* p_wei_packed = static_cast<TWei*>(wei_packed_buf.GetDeviceBuffer());
TOut* p_out_packed = static_cast<TOut*>(out_packed_buf.GetDeviceBuffer());
// Compute strides and allocate device arrays for pack/unpack
std::vector<index_t> in_strides = compute_conv_tensor_strides<InLayout>(in_lengths, ndim);
std::vector<index_t> wei_strides = compute_conv_tensor_strides<WeiLayout>(wei_lengths, ndim);
std::vector<index_t> out_strides = compute_conv_tensor_strides<OutLayout>(out_lengths, ndim);
const size_t dim_count = in_lengths.size();
SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t));
SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t));
SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t));
SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t));
SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t));
SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t));
index_t* d_in_lengths = static_cast<index_t*>(in_lengths_buf.GetDeviceBuffer());
index_t* d_in_strides = static_cast<index_t*>(in_strides_buf.GetDeviceBuffer());
index_t* d_wei_lengths = static_cast<index_t*>(wei_lengths_buf.GetDeviceBuffer());
index_t* d_wei_strides = static_cast<index_t*>(wei_strides_buf.GetDeviceBuffer());
index_t* d_out_lengths = static_cast<index_t*>(out_lengths_buf.GetDeviceBuffer());
index_t* d_out_strides = static_cast<index_t*>(out_strides_buf.GetDeviceBuffer());
HIP_CHECK_ERROR(hipMemcpy(
d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
// Pack output and weight tensors to contiguous layout (inputs to bwd data)
constexpr int block_size = 256;
strided_copy_kernel<TOut, false>
<<<(out_total + block_size - 1) / block_size, block_size, 0, stream>>>(
p_out, p_out_packed, d_out_lengths, d_out_strides, dim_count, out_total);
strided_copy_kernel<TWei, false>
<<<(wei_total + block_size - 1) / block_size, block_size, 0, stream>>>(
p_wei, p_wei_packed, d_wei_lengths, d_wei_strides, dim_count, wei_total);
// Build conv parameter vectors for kernel invocation
std::vector<index_t> conv_strides(ndim);
std::vector<index_t> conv_dilations(ndim);
std::vector<index_t> input_pads(ndim);
for(index_t i = 0; i < ndim; ++i)
{
conv_strides[i] = static_cast<index_t>(conv_param.conv_filter_strides_[i]);
conv_dilations[i] = static_cast<index_t>(conv_param.conv_filter_dilations_[i]);
input_pads[i] = static_cast<index_t>(conv_param.input_left_pads_[i]);
}
// Run backward data convolution kernel on packed data
const int in_grid = (in_total + block_size - 1) / block_size;
if(ndim == 1)
{
naive_conv_bwd_data_packed<1,
TIn,
TWei,
TOut,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
<<<in_grid, block_size, 0, stream>>>(p_in_packed,
p_wei_packed,
p_out_packed,
G,
N,
K,
C,
1,
1,
in_lengths[3],
1,
1,
wei_lengths[3],
1,
1,
out_lengths[3],
1,
1,
conv_strides[0],
1,
1,
conv_dilations[0],
0,
0,
input_pads[0],
in_element_op,
wei_element_op,
out_element_op);
}
else if(ndim == 2)
{
naive_conv_bwd_data_packed<2,
TIn,
TWei,
TOut,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
<<<in_grid, block_size, 0, stream>>>(p_in_packed,
p_wei_packed,
p_out_packed,
G,
N,
K,
C,
1,
in_lengths[3],
in_lengths[4],
1,
wei_lengths[3],
wei_lengths[4],
1,
out_lengths[3],
out_lengths[4],
1,
conv_strides[0],
conv_strides[1],
1,
conv_dilations[0],
conv_dilations[1],
0,
input_pads[0],
input_pads[1],
in_element_op,
wei_element_op,
out_element_op);
}
else // 3D
{
naive_conv_bwd_data_packed<3,
TIn,
TWei,
TOut,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
<<<in_grid, block_size, 0, stream>>>(p_in_packed,
p_wei_packed,
p_out_packed,
G,
N,
K,
C,
in_lengths[3],
in_lengths[4],
in_lengths[5],
wei_lengths[3],
wei_lengths[4],
wei_lengths[5],
out_lengths[3],
out_lengths[4],
out_lengths[5],
conv_strides[0],
conv_strides[1],
conv_strides[2],
conv_dilations[0],
conv_dilations[1],
conv_dilations[2],
input_pads[0],
input_pads[1],
input_pads[2],
in_element_op,
wei_element_op,
out_element_op);
}
// Unpack result back to strided layout
strided_copy_kernel<TIn, true><<<in_grid, block_size, 0, stream>>>(
p_in_packed, p_in, d_in_lengths, d_in_strides, dim_count, in_total);
HIP_CHECK_ERROR(hipGetLastError());
// Memory automatically freed by SimpleDeviceMem destructors
}
} // namespace ref
} // namespace ck
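
The divisibility checks in the kernels above invert the forward index map. Forward computes an input coordinate from an output coordinate as wi = wo*stride_x - pad_x + x*dilation_x; solving for wo gives wo = (wi + pad_x - x*dilation_x) / stride_x, so a filter tap x contributes to input position wi only when that division is exact and 0 <= wo < Wo, which is precisely the pattern of the w_tmp % stride_x == 0 test followed by the range check. The host wrapper keeps the kernel layout-agnostic: it packs the strided out and wei tensors into contiguous buffers with strided_copy_kernel, runs the packed kernel, and unpacks the result into the caller's in layout, so one kernel body serves every layout that compute_conv_tensor_strides understands.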

View File

@@ -4,136 +4,497 @@
#pragma once
#include "ck/utility/type_convert.hpp"
#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
namespace ref {
/*
* \brief naive implementation of 3D convolution backward weight.
* Layout is (NDHWC, KZYXC, NDHWK).
* Computes gradient with respect to weights.
*
* \param N number of batches
* \param K number of filters (output channels)
* \param C number of input channels
* \param (Di, Hi, Wi) depth, height and width dimension of input
* \param (Z, Y, X) depth, height and width dimensions of filter
* \param (Do, Ho, Wo) depth, height and width dimension of output
* \param (stride_z, stride_y, stride_x) strides
* \param (dilation_z, dilation_y, dilation_x) dilations
* \param (pad_z, pad_y, pad_x) pads
*/
template <typename TIn,
typename TWei,
typename TOut,
typename TAcc,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
__global__ void naive_conv_bwd_weight_ndhwc_kzyxc_ndhwk(const TIn* __restrict__ p_in,
TWei* __restrict__ p_wei_grad,
const TOut* __restrict__ p_out_grad,
const ConvDims dims)
// Optimized backward weight convolution kernel working with packed (contiguous) tensors
// Assumes row-major packing: input[G][N][C][spatial], output_grad[G][N][K][spatial],
// weight_grad[G][K][C][filter]
// Computes gradient with respect to weights
template <index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp>
__global__ void naive_conv_bwd_weight_packed(const InDataType* __restrict__ p_in,
WeiDataType* __restrict__ p_wei_grad,
const OutDataType* __restrict__ p_out_grad,
index_t G,
index_t N,
index_t K,
index_t C,
index_t Di,
index_t Hi,
index_t Wi,
index_t Z,
index_t Y,
index_t X,
index_t Do,
index_t Ho,
index_t Wo,
index_t stride_z,
index_t stride_y,
index_t stride_x,
index_t dilation_z,
index_t dilation_y,
index_t dilation_x,
index_t pad_z,
index_t pad_y,
index_t pad_x,
InElementOp in_op,
WeiElementOp wei_op,
OutElementOp out_op)
{
const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const index_t num_threads = blockDim.x * gridDim.x;
const long_index_t weight_length = dims.K * dims.Z * dims.Y * dims.X * dims.C;
const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const long_index_t num_threads = blockDim.x * gridDim.x;
const index_t in_strides[] = {
dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C};
const index_t out_strides[] = {
dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K};
const index_t wei_strides[] = {
dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C};
InDataType in_val = InDataType{0};
WeiDataType wei_val = WeiDataType{0};
OutDataType out_val = OutDataType{0};
constexpr auto in_op = InElementwiseOperation{};
constexpr auto wei_op = WeiElementwiseOperation{};
constexpr auto out_op = OutElementwiseOperation{};
TIn in_val = TIn{0};
TWei wei_val = TWei{0};
TOut out_val = TOut{0};
for(long_index_t ii = tid; ii < weight_length; ii += num_threads)
if constexpr(NDimSpatial == 1)
{
// Decode linear index to (k, z, y, x, c)
const index_t k = ii / wei_strides[0];
index_t tmp = ii - k * wei_strides[0];
const index_t z = tmp / wei_strides[1];
tmp -= z * wei_strides[1];
const index_t y = tmp / wei_strides[2];
tmp -= y * wei_strides[2];
const index_t x = tmp / wei_strides[3];
tmp -= x * wei_strides[3];
const index_t c = tmp;
const long_index_t num_wei = G * K * C * X;
const long_index_t in_stride_g = N * C * Wi;
const long_index_t in_stride_n = C * Wi;
const long_index_t in_stride_c = Wi;
const long_index_t out_stride_g = N * K * Wo;
const long_index_t out_stride_n = K * Wo;
const long_index_t out_stride_k = Wo;
const long_index_t wei_stride_g = K * C * X;
const long_index_t wei_stride_k = C * X;
const long_index_t wei_stride_c = X;
// Always accumulate in float
float acc_float = 0.0f;
// Loop over batch
for(index_t n = 0; n < dims.N; ++n)
for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
{
const TIn* in_n = p_in + static_cast<long_index_t>(n) * in_strides[0];
const TOut* out_n = p_out_grad + static_cast<long_index_t>(n) * out_strides[0];
index_t remaining = idx;
const index_t x = remaining % X;
remaining /= X;
const index_t c = remaining % C;
remaining /= C;
const index_t k = remaining % K;
const index_t g = remaining / K;
// Loop over output spatial dimensions
for(index_t d_o = 0; d_o < dims.Do; ++d_o)
float acc = 0.0f;
const InDataType* in_g = p_in + g * in_stride_g;
const OutDataType* out_grad = p_out_grad + g * out_stride_g;
// Loop over batch and output positions
for(index_t n = 0; n < N; ++n)
{
// Calculate input position from output position
index_t di = d_o * dims.stride_z - dims.pad_z + z * dims.dilation_z;
if(di < 0 || di >= dims.Di)
continue;
const InDataType* in_gn = in_g + n * in_stride_n + c * in_stride_c;
const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
const TIn* in_n_di = in_n + di * in_strides[1];
const TOut* out_n_do = out_n + d_o * out_strides[1];
for(index_t ho = 0; ho < dims.Ho; ++ho)
for(index_t wo = 0; wo < Wo; ++wo)
{
index_t hi = ho * dims.stride_y - dims.pad_y + y * dims.dilation_y;
if(hi < 0 || hi >= dims.Hi)
continue;
const TIn* in_n_di_hi = in_n_di + hi * in_strides[2];
const TOut* out_n_do_ho = out_n_do + ho * out_strides[2];
for(index_t wo = 0; wo < dims.Wo; ++wo)
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
if(wi >= 0 && wi < Wi)
{
index_t wi = wo * dims.stride_x - dims.pad_x + x * dims.dilation_x;
if(wi < 0 || wi >= dims.Wi)
continue;
// Load values from memory (like forward does)
const TIn* in_ptr = in_n_di_hi + wi * in_strides[3];
const TOut* out_ptr = out_n_do_ho + wo * out_strides[3];
TIn in_loaded = in_ptr[c];
TOut out_loaded = out_ptr[k];
// Apply element-wise operations
in_op(in_val, in_loaded);
out_op(out_val, out_loaded);
// Convert to float for multiplication
float in_f = type_convert<float>(in_val);
float out_f = type_convert<float>(out_val);
acc_float += out_f * in_f;
in_op(in_val, in_gn[wi]);
out_op(out_val, out_gn_k[wo]);
acc += type_convert<float>(out_val) * type_convert<float>(in_val);
}
}
}
WeiDataType result = type_convert<WeiDataType>(acc);
wei_op(wei_val, result);
p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + x] = wei_val;
}
}
else if constexpr(NDimSpatial == 2)
{
const long_index_t num_wei = G * K * C * Y * X;
const long_index_t in_stride_g = N * C * Hi * Wi;
const long_index_t in_stride_n = C * Hi * Wi;
const long_index_t in_stride_c = Hi * Wi;
const long_index_t in_stride_h = Wi;
const long_index_t out_stride_g = N * K * Ho * Wo;
const long_index_t out_stride_n = K * Ho * Wo;
const long_index_t out_stride_k = Ho * Wo;
const long_index_t out_stride_h = Wo;
const long_index_t wei_stride_g = K * C * Y * X;
const long_index_t wei_stride_k = C * Y * X;
const long_index_t wei_stride_c = Y * X;
const long_index_t wei_stride_y = X;
// Convert float accumulator to TAcc, then to weight type
TAcc acc = type_convert<TAcc>(acc_float);
TWei result = type_convert<TWei>(acc);
for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
{
index_t remaining = idx;
const index_t x = remaining % X;
remaining /= X;
const index_t y = remaining % Y;
remaining /= Y;
const index_t c = remaining % C;
remaining /= C;
const index_t k = remaining % K;
const index_t g = remaining / K;
// Apply weight element-wise operation (if any)
wei_op(wei_val, result);
float acc = 0.0f;
const InDataType* in_g = p_in + g * in_stride_g;
const OutDataType* out_grad = p_out_grad + g * out_stride_g;
// Write transformed result
p_wei_grad[ii] = wei_val;
// Loop over batch and output positions
for(index_t n = 0; n < N; ++n)
{
const InDataType* in_gnc = in_g + n * in_stride_n + c * in_stride_c;
const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
for(index_t ho = 0; ho < Ho; ++ho)
{
long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
if(hi >= 0 && hi < Hi)
{
const InDataType* in_gnch = in_gnc + hi * in_stride_h;
const OutDataType* out_gn_kh = out_gn_k + ho * out_stride_h;
for(index_t wo = 0; wo < Wo; ++wo)
{
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
if(wi >= 0 && wi < Wi)
{
in_op(in_val, in_gnch[wi]);
out_op(out_val, out_gn_kh[wo]);
acc += type_convert<float>(out_val) * type_convert<float>(in_val);
}
}
}
}
}
WeiDataType result = type_convert<WeiDataType>(acc);
wei_op(wei_val, result);
p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + y * wei_stride_y +
x] = wei_val;
}
}
else if constexpr(NDimSpatial == 3)
{
const long_index_t num_wei = G * K * C * Z * Y * X;
const long_index_t in_stride_g = N * C * Di * Hi * Wi;
const long_index_t in_stride_n = C * Di * Hi * Wi;
const long_index_t in_stride_c = Di * Hi * Wi;
const long_index_t in_stride_d = Hi * Wi;
const long_index_t in_stride_h = Wi;
const long_index_t out_stride_g = N * K * Do * Ho * Wo;
const long_index_t out_stride_n = K * Do * Ho * Wo;
const long_index_t out_stride_k = Do * Ho * Wo;
const long_index_t out_stride_d = Ho * Wo;
const long_index_t out_stride_h = Wo;
const long_index_t wei_stride_g = K * C * Z * Y * X;
const long_index_t wei_stride_k = C * Z * Y * X;
const long_index_t wei_stride_c = Z * Y * X;
const long_index_t wei_stride_z = Y * X;
const long_index_t wei_stride_y = X;
for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
{
index_t remaining = idx;
const index_t x = remaining % X;
remaining /= X;
const index_t y = remaining % Y;
remaining /= Y;
const index_t z = remaining % Z;
remaining /= Z;
const index_t c = remaining % C;
remaining /= C;
const index_t k = remaining % K;
const index_t g = remaining / K;
float acc = 0.0f;
const InDataType* in_g = p_in + g * in_stride_g;
const OutDataType* out_grad = p_out_grad + g * out_stride_g;
// Loop over batch and output positions
for(index_t n = 0; n < N; ++n)
{
const InDataType* in_gnc = in_g + n * in_stride_n + c * in_stride_c;
const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
for(index_t do_idx = 0; do_idx < Do; ++do_idx)
{
long_index_t di = do_idx * stride_z + z * dilation_z - pad_z;
if(di >= 0 && di < Di)
{
const InDataType* in_gncd = in_gnc + di * in_stride_d;
const OutDataType* out_gn_kd = out_gn_k + do_idx * out_stride_d;
for(index_t ho = 0; ho < Ho; ++ho)
{
long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
if(hi >= 0 && hi < Hi)
{
const InDataType* in_gncdh = in_gncd + hi * in_stride_h;
const OutDataType* out_gn_kdh = out_gn_kd + ho * out_stride_h;
for(index_t wo = 0; wo < Wo; ++wo)
{
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
if(wi >= 0 && wi < Wi)
{
in_op(in_val, in_gncdh[wi]);
out_op(out_val, out_gn_kdh[wo]);
acc += type_convert<float>(out_val) *
type_convert<float>(in_val);
}
}
}
}
}
}
}
WeiDataType result = type_convert<WeiDataType>(acc);
wei_op(wei_val, result);
p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + z * wei_stride_z +
y * wei_stride_y + x] = wei_val;
}
}
}
// GPU reference backward weight convolution - takes ConvParam directly
template <typename InLayout,
typename WeiLayout,
typename OutLayout,
typename TIn,
typename TWei,
typename TOut,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
void naive_conv_bwd_weight(const TIn* p_in,
TWei* p_wei_grad,
const TOut* p_out,
const ck::utils::conv::ConvParam& conv_param,
InElementwiseOperation in_element_op = InElementwiseOperation{},
WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{},
OutElementwiseOperation out_element_op = OutElementwiseOperation{},
hipStream_t stream = nullptr)
{
const auto ndim = conv_param.num_dim_spatial_;
const index_t G = conv_param.G_;
const index_t N = conv_param.N_;
const index_t C = conv_param.C_;
const index_t K = conv_param.K_;
std::vector<index_t> in_lengths = {G, N, C};
std::vector<index_t> wei_lengths = {G, K, C};
std::vector<index_t> out_lengths = {G, N, K};
for(index_t i = 0; i < ndim; ++i)
{
in_lengths.push_back(static_cast<index_t>(conv_param.input_spatial_lengths_[i]));
wei_lengths.push_back(static_cast<index_t>(conv_param.filter_spatial_lengths_[i]));
out_lengths.push_back(static_cast<index_t>(conv_param.output_spatial_lengths_[i]));
}
// Calculate total elements for buffer allocation
long_index_t in_total = 1, wei_total = 1, out_total = 1;
for(auto l : in_lengths)
in_total *= l;
for(auto l : wei_lengths)
wei_total *= l;
for(auto l : out_lengths)
out_total *= l;
// Allocate packed buffers
SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn));
SimpleDeviceMem wei_grad_packed_buf(wei_total * sizeof(TWei));
SimpleDeviceMem out_grad_packed_buf(out_total * sizeof(TOut));
TIn* p_in_packed = static_cast<TIn*>(in_packed_buf.GetDeviceBuffer());
TWei* p_wei_grad_packed = static_cast<TWei*>(wei_grad_packed_buf.GetDeviceBuffer());
TOut* p_out_grad_packed = static_cast<TOut*>(out_grad_packed_buf.GetDeviceBuffer());
// Compute strides and allocate device arrays for pack/unpack
std::vector<index_t> in_strides = compute_conv_tensor_strides<InLayout>(in_lengths, ndim);
std::vector<index_t> wei_strides = compute_conv_tensor_strides<WeiLayout>(wei_lengths, ndim);
std::vector<index_t> out_strides = compute_conv_tensor_strides<OutLayout>(out_lengths, ndim);
const size_t dim_count = in_lengths.size();
SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t));
SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t));
SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t));
SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t));
SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t));
SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t));
index_t* d_in_lengths = static_cast<index_t*>(in_lengths_buf.GetDeviceBuffer());
index_t* d_in_strides = static_cast<index_t*>(in_strides_buf.GetDeviceBuffer());
index_t* d_wei_lengths = static_cast<index_t*>(wei_lengths_buf.GetDeviceBuffer());
index_t* d_wei_strides = static_cast<index_t*>(wei_strides_buf.GetDeviceBuffer());
index_t* d_out_lengths = static_cast<index_t*>(out_lengths_buf.GetDeviceBuffer());
index_t* d_out_strides = static_cast<index_t*>(out_strides_buf.GetDeviceBuffer());
HIP_CHECK_ERROR(hipMemcpy(
d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
// Pack input and output_grad tensors to contiguous layout (inputs to bwd weight)
constexpr int block_size = 256;
strided_copy_kernel<TIn, false>
<<<(in_total + block_size - 1) / block_size, block_size, 0, stream>>>(
p_in, p_in_packed, d_in_lengths, d_in_strides, dim_count, in_total);
strided_copy_kernel<TOut, false>
<<<(out_total + block_size - 1) / block_size, block_size, 0, stream>>>(
p_out, p_out_grad_packed, d_out_lengths, d_out_strides, dim_count, out_total);
// Build conv parameter vectors for kernel invocation
std::vector<index_t> conv_strides(ndim);
std::vector<index_t> conv_dilations(ndim);
std::vector<index_t> input_pads(ndim);
for(index_t i = 0; i < ndim; ++i)
{
conv_strides[i] = static_cast<index_t>(conv_param.conv_filter_strides_[i]);
conv_dilations[i] = static_cast<index_t>(conv_param.conv_filter_dilations_[i]);
input_pads[i] = static_cast<index_t>(conv_param.input_left_pads_[i]);
}
// Run backward weight convolution kernel on packed data
const int wei_grid = (wei_total + block_size - 1) / block_size;
if(ndim == 1)
{
naive_conv_bwd_weight_packed<1,
TIn,
TWei,
TOut,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
<<<wei_grid, block_size, 0, stream>>>(p_in_packed,
p_wei_grad_packed,
p_out_grad_packed,
G,
N,
K,
C,
1,
1,
in_lengths[3],
1,
1,
wei_lengths[3],
1,
1,
out_lengths[3],
1,
1,
conv_strides[0],
1,
1,
conv_dilations[0],
0,
0,
input_pads[0],
in_element_op,
wei_element_op,
out_element_op);
}
else if(ndim == 2)
{
naive_conv_bwd_weight_packed<2,
TIn,
TWei,
TOut,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
<<<wei_grid, block_size, 0, stream>>>(p_in_packed,
p_wei_grad_packed,
p_out_grad_packed,
G,
N,
K,
C,
1,
in_lengths[3],
in_lengths[4],
1,
wei_lengths[3],
wei_lengths[4],
1,
out_lengths[3],
out_lengths[4],
1,
conv_strides[0],
conv_strides[1],
1,
conv_dilations[0],
conv_dilations[1],
0,
input_pads[0],
input_pads[1],
in_element_op,
wei_element_op,
out_element_op);
}
else // 3D
{
naive_conv_bwd_weight_packed<3,
TIn,
TWei,
TOut,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
<<<wei_grid, block_size, 0, stream>>>(p_in_packed,
p_wei_grad_packed,
p_out_grad_packed,
G,
N,
K,
C,
in_lengths[3],
in_lengths[4],
in_lengths[5],
wei_lengths[3],
wei_lengths[4],
wei_lengths[5],
out_lengths[3],
out_lengths[4],
out_lengths[5],
conv_strides[0],
conv_strides[1],
conv_strides[2],
conv_dilations[0],
conv_dilations[1],
conv_dilations[2],
input_pads[0],
input_pads[1],
input_pads[2],
in_element_op,
wei_element_op,
out_element_op);
}
// Unpack weight gradient
strided_copy_kernel<TWei, true><<<wei_grid, block_size, 0, stream>>>(
p_wei_grad_packed, p_wei_grad, d_wei_lengths, d_wei_strides, dim_count, wei_total);
HIP_CHECK_ERROR(hipGetLastError());
// Memory automatically freed by SimpleDeviceMem destructors
}
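// Illustrative call (hedged sketch; PassThrough element ops and the
// GNCHW-family layouts are assumptions mirroring the profiler usage later
// in this diff):
//   using PassThrough = ck::tensor_operation::element_wise::PassThrough;
//   ck::ref::naive_conv_bwd_weight<ck::tensor_layout::convolution::GNCHW,
//                                  ck::tensor_layout::convolution::GKCYX,
//                                  ck::tensor_layout::convolution::GNKHW,
//                                  float, float, float,
//                                  PassThrough, PassThrough, PassThrough>(
//       p_in_dev, p_dwei_dev, p_dout_dev, conv_param);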
} // namespace ref
} // namespace ck

View File

@@ -4,126 +4,493 @@
#pragma once
#include "ck/utility/type_convert.hpp"
#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
namespace ref {
// Naive forward convolution kernel operating on packed (contiguous) tensors
// Assumes row-major packing: input[G][N][C][spatial], weight[G][K][C][filter],
// output[G][N][K][spatial]
template <index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp>
__global__ void naive_conv_fwd_packed(const InDataType* __restrict__ p_in,
const WeiDataType* __restrict__ p_wei,
OutDataType* __restrict__ p_out,
index_t G,
index_t N,
index_t K,
index_t C,
index_t Di,
index_t Hi,
index_t Wi,
index_t Z,
index_t Y,
index_t X,
index_t Do,
index_t Ho,
index_t Wo,
index_t stride_z,
index_t stride_y,
index_t stride_x,
index_t dilation_z,
index_t dilation_y,
index_t dilation_x,
index_t pad_z,
index_t pad_y,
index_t pad_x,
InElementOp in_op,
WeiElementOp wei_op,
OutElementOp out_op)
{
const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const long_index_t num_threads = blockDim.x * gridDim.x;
InDataType in_val = InDataType{0};
WeiDataType wei_val = WeiDataType{0};
OutDataType out_val = OutDataType{0};
if constexpr(NDimSpatial == 1)
{
const long_index_t num_out = G * N * K * Wo;
const long_index_t in_stride_g = N * C * Wi;
const long_index_t in_stride_n = C * Wi;
const long_index_t in_stride_c = Wi;
const long_index_t wei_stride_g = K * C * X;
const long_index_t wei_stride_k = C * X;
const long_index_t wei_stride_c = X;
const long_index_t out_stride_g = N * K * Wo;
const long_index_t out_stride_n = K * Wo;
const long_index_t out_stride_k = Wo;
for(long_index_t idx = tid; idx < num_out; idx += num_threads)
{
index_t remaining = idx;
const index_t wo = remaining % Wo;
remaining /= Wo;
const index_t k = remaining % K;
remaining /= K;
const index_t n = remaining % N;
const index_t g = remaining / N;
float acc = 0.0f;
const InDataType* in_g = p_in + g * in_stride_g + n * in_stride_n;
const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k;
for(index_t c = 0; c < C; ++c)
{
const InDataType* in_gc = in_g + c * in_stride_c;
const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c;
for(index_t x = 0; x < X; ++x)
{
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
if(wi >= 0 && wi < Wi)
{
in_op(in_val, in_gc[wi]);
wei_op(wei_val, wei_gkc[x]);
acc += type_convert<float>(in_val) * type_convert<float>(wei_val);
}
}
}
OutDataType result = type_convert<OutDataType>(acc);
out_op(out_val, result);
p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + wo] = out_val;
}
}
else if constexpr(NDimSpatial == 2)
{
const long_index_t num_out = G * N * K * Ho * Wo;
const long_index_t in_stride_g = N * C * Hi * Wi;
const long_index_t in_stride_n = C * Hi * Wi;
const long_index_t in_stride_c = Hi * Wi;
const long_index_t in_stride_h = Wi;
const long_index_t wei_stride_g = K * C * Y * X;
const long_index_t wei_stride_k = C * Y * X;
const long_index_t wei_stride_c = Y * X;
const long_index_t wei_stride_y = X;
const long_index_t out_stride_g = N * K * Ho * Wo;
const long_index_t out_stride_n = K * Ho * Wo;
const long_index_t out_stride_k = Ho * Wo;
const long_index_t out_stride_h = Wo;
for(long_index_t idx = tid; idx < num_out; idx += num_threads)
{
index_t remaining = idx;
const index_t wo = remaining % Wo;
remaining /= Wo;
const index_t ho = remaining % Ho;
remaining /= Ho;
const index_t k = remaining % K;
remaining /= K;
const index_t n = remaining % N;
const index_t g = remaining / N;
float acc = 0.0f;
const InDataType* in_gn = p_in + g * in_stride_g + n * in_stride_n;
const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k;
for(index_t c = 0; c < C; ++c)
{
const InDataType* in_gnc = in_gn + c * in_stride_c;
const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c;
for(index_t y = 0; y < Y; ++y)
{
long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
if(hi >= 0 && hi < Hi)
{
const InDataType* in_gnch = in_gnc + hi * in_stride_h;
const WeiDataType* wei_gkcy = wei_gkc + y * wei_stride_y;
for(index_t x = 0; x < X; ++x)
{
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
if(wi >= 0 && wi < Wi)
{
in_op(in_val, in_gnch[wi]);
wei_op(wei_val, wei_gkcy[x]);
acc += type_convert<float>(in_val) * type_convert<float>(wei_val);
}
}
}
}
}
OutDataType result = type_convert<OutDataType>(acc);
out_op(out_val, result);
p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + ho * out_stride_h + wo] =
out_val;
}
}
else if constexpr(NDimSpatial == 3)
{
const long_index_t num_out = G * N * K * Do * Ho * Wo;
const long_index_t in_stride_g = N * C * Di * Hi * Wi;
const long_index_t in_stride_n = C * Di * Hi * Wi;
const long_index_t in_stride_c = Di * Hi * Wi;
const long_index_t in_stride_d = Hi * Wi;
const long_index_t in_stride_h = Wi;
const long_index_t wei_stride_g = K * C * Z * Y * X;
const long_index_t wei_stride_k = C * Z * Y * X;
const long_index_t wei_stride_c = Z * Y * X;
const long_index_t wei_stride_z = Y * X;
const long_index_t wei_stride_y = X;
const long_index_t out_stride_g = N * K * Do * Ho * Wo;
const long_index_t out_stride_n = K * Do * Ho * Wo;
const long_index_t out_stride_k = Do * Ho * Wo;
const long_index_t out_stride_d = Ho * Wo;
const long_index_t out_stride_h = Wo;
for(long_index_t idx = tid; idx < num_out; idx += num_threads)
{
index_t remaining = idx;
const index_t wo = remaining % Wo;
remaining /= Wo;
const index_t ho = remaining % Ho;
remaining /= Ho;
const index_t do_idx = remaining % Do;
remaining /= Do;
const index_t k = remaining % K;
remaining /= K;
const index_t n = remaining % N;
const index_t g = remaining / N;
float acc = 0.0f;
const InDataType* in_gn = p_in + g * in_stride_g + n * in_stride_n;
const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k;
for(index_t c = 0; c < C; ++c)
{
const InDataType* in_gnc = in_gn + c * in_stride_c;
const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c;
for(index_t z = 0; z < Z; ++z)
{
long_index_t di = do_idx * stride_z + z * dilation_z - pad_z;
if(di >= 0 && di < Di)
{
const InDataType* in_gncd = in_gnc + di * in_stride_d;
const WeiDataType* wei_gkcz = wei_gkc + z * wei_stride_z;
for(index_t y = 0; y < Y; ++y)
{
long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
if(hi >= 0 && hi < Hi)
{
const InDataType* in_gncdh = in_gncd + hi * in_stride_h;
const WeiDataType* wei_gkczy = wei_gkcz + y * wei_stride_y;
for(index_t x = 0; x < X; ++x)
{
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
if(wi >= 0 && wi < Wi)
{
in_op(in_val, in_gncdh[wi]);
wei_op(wei_val, wei_gkczy[x]);
acc += type_convert<float>(in_val) *
type_convert<float>(wei_val);
}
}
}
}
}
}
}
OutDataType result = type_convert<OutDataType>(acc);
out_op(out_val, result);
p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + do_idx * out_stride_d +
ho * out_stride_h + wo] = out_val;
}
}
}
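// For reference, each output element computed above is (illustrative summary
// of the code above, not additional functionality):
//   Out[g,n,k,o...] = out_op( sum over c and all filter taps of
//       in_op(In[g,n,c,i...]) * wei_op(W[g,k,c,tap...]) )
// with i = o*stride + tap*dilation - pad per spatial dimension; taps that
// map outside the input are skipped, and accumulation is in float.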
// GPU reference convolution - takes ConvParam directly
template <typename InLayout,
typename WeiLayout,
typename OutLayout,
typename TIn,
typename TWei,
typename TOut,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation>
void naive_conv_fwd(const TIn* p_in,
const TWei* p_wei,
TOut* p_out,
const ck::utils::conv::ConvParam& conv_param,
InElementwiseOperation in_element_op = InElementwiseOperation{},
WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{},
OutElementwiseOperation out_element_op = OutElementwiseOperation{},
hipStream_t stream = nullptr)
{
const auto ndim = conv_param.num_dim_spatial_;
const index_t G = conv_param.G_;
const index_t N = conv_param.N_;
const index_t C = conv_param.C_;
const index_t K = conv_param.K_;
std::vector<index_t> in_lengths = {G, N, C};
std::vector<index_t> wei_lengths = {G, K, C};
std::vector<index_t> out_lengths = {G, N, K};
for(index_t i = 0; i < ndim; ++i)
{
in_lengths.push_back(static_cast<index_t>(conv_param.input_spatial_lengths_[i]));
wei_lengths.push_back(static_cast<index_t>(conv_param.filter_spatial_lengths_[i]));
out_lengths.push_back(static_cast<index_t>(conv_param.output_spatial_lengths_[i]));
}
// Calculate total elements for buffer allocation
long_index_t in_total = 1, wei_total = 1, out_total = 1;
for(auto l : in_lengths)
in_total *= l;
for(auto l : wei_lengths)
wei_total *= l;
for(auto l : out_lengths)
out_total *= l;
// Allocate packed buffers
SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn));
SimpleDeviceMem wei_packed_buf(wei_total * sizeof(TWei));
SimpleDeviceMem out_packed_buf(out_total * sizeof(TOut));
TIn* p_in_packed = static_cast<TIn*>(in_packed_buf.GetDeviceBuffer());
TWei* p_wei_packed = static_cast<TWei*>(wei_packed_buf.GetDeviceBuffer());
TOut* p_out_packed = static_cast<TOut*>(out_packed_buf.GetDeviceBuffer());
// Compute strides and allocate device arrays for pack/unpack
std::vector<index_t> in_strides = compute_conv_tensor_strides<InLayout>(in_lengths, ndim);
std::vector<index_t> wei_strides = compute_conv_tensor_strides<WeiLayout>(wei_lengths, ndim);
std::vector<index_t> out_strides = compute_conv_tensor_strides<OutLayout>(out_lengths, ndim);
const size_t dim_count = in_lengths.size();
SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t));
SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t));
SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t));
SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t));
SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t));
SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t));
index_t* d_in_lengths = static_cast<index_t*>(in_lengths_buf.GetDeviceBuffer());
index_t* d_in_strides = static_cast<index_t*>(in_strides_buf.GetDeviceBuffer());
index_t* d_wei_lengths = static_cast<index_t*>(wei_lengths_buf.GetDeviceBuffer());
index_t* d_wei_strides = static_cast<index_t*>(wei_strides_buf.GetDeviceBuffer());
index_t* d_out_lengths = static_cast<index_t*>(out_lengths_buf.GetDeviceBuffer());
index_t* d_out_strides = static_cast<index_t*>(out_strides_buf.GetDeviceBuffer());
HIP_CHECK_ERROR(hipMemcpy(
d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
HIP_CHECK_ERROR(hipMemcpy(
d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
// Pack input and weight tensors to contiguous layout
constexpr int block_size = 256;
strided_copy_kernel<TIn, false>
<<<(in_total + block_size - 1) / block_size, block_size, 0, stream>>>(
p_in, p_in_packed, d_in_lengths, d_in_strides, dim_count, in_total);
strided_copy_kernel<TWei, false>
<<<(wei_total + block_size - 1) / block_size, block_size, 0, stream>>>(
p_wei, p_wei_packed, d_wei_lengths, d_wei_strides, dim_count, wei_total);
// Build conv parameter vectors for kernel invocation
std::vector<index_t> conv_strides(ndim);
std::vector<index_t> conv_dilations(ndim);
std::vector<index_t> input_pads(ndim);
for(index_t i = 0; i < ndim; ++i)
{
conv_strides[i] = static_cast<index_t>(conv_param.conv_filter_strides_[i]);
conv_dilations[i] = static_cast<index_t>(conv_param.conv_filter_dilations_[i]);
input_pads[i] = static_cast<index_t>(conv_param.input_left_pads_[i]);
}
// Run convolution kernel on packed data
const int out_grid = (out_total + block_size - 1) / block_size;
if(ndim == 1)
{
naive_conv_fwd_packed<1,
TIn,
TWei,
TOut,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
<<<out_grid, block_size, 0, stream>>>(p_in_packed,
p_wei_packed,
p_out_packed,
G,
N,
K,
C,
1,
1,
in_lengths[3],
1,
1,
wei_lengths[3],
1,
1,
out_lengths[3],
1,
1,
conv_strides[0],
1,
1,
conv_dilations[0],
0,
0,
input_pads[0],
in_element_op,
wei_element_op,
out_element_op);
}
else if(ndim == 2)
{
naive_conv_fwd_packed<2,
TIn,
TWei,
TOut,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
<<<out_grid, block_size, 0, stream>>>(p_in_packed,
p_wei_packed,
p_out_packed,
G,
N,
K,
C,
1,
in_lengths[3],
in_lengths[4],
1,
wei_lengths[3],
wei_lengths[4],
1,
out_lengths[3],
out_lengths[4],
1,
conv_strides[0],
conv_strides[1],
1,
conv_dilations[0],
conv_dilations[1],
0,
input_pads[0],
input_pads[1],
in_element_op,
wei_element_op,
out_element_op);
}
else // 3D
{
naive_conv_fwd_packed<3,
TIn,
TWei,
TOut,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
<<<out_grid, block_size, 0, stream>>>(p_in_packed,
p_wei_packed,
p_out_packed,
G,
N,
K,
C,
in_lengths[3],
in_lengths[4],
in_lengths[5],
wei_lengths[3],
wei_lengths[4],
wei_lengths[5],
out_lengths[3],
out_lengths[4],
out_lengths[5],
conv_strides[0],
conv_strides[1],
conv_strides[2],
conv_dilations[0],
conv_dilations[1],
conv_dilations[2],
input_pads[0],
input_pads[1],
input_pads[2],
in_element_op,
wei_element_op,
out_element_op);
}
// Unpack output tensor from contiguous back to its strided layout
strided_copy_kernel<TOut, true><<<out_grid, block_size, 0, stream>>>(
p_out_packed, p_out, d_out_lengths, d_out_strides, dim_count, out_total);
HIP_CHECK_ERROR(hipGetLastError());
// Memory automatically freed by SimpleDeviceMem destructors
}
} // namespace ref
} // namespace ck

View File

@@ -0,0 +1,177 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck/ck.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include <hip/hip_runtime.h>
#include <algorithm> // std::find, used in compute_conv_tensor_strides
#include <vector>
namespace ck {
namespace ref {
// RAII wrapper for device memory to prevent leaks
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&p_mem_), mem_size));
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
// Helper function to map layout dimension character to index in lengths array
// lengths array structure: [G, N/K, C/K, spatial...]
inline int map_dim_char_to_index(char dim_char, index_t ndim_spatial, bool is_weight)
{
// G dimension
if(dim_char == 'G')
return 0;
// Batch/output channels dimension (N for input/output, K for weight's first non-G dim)
if(dim_char == 'N')
return 1;
if(dim_char == 'K' && is_weight)
return 1;
// Channel dimension (C for input/weight, K for output)
if(dim_char == 'C')
return 2;
if(dim_char == 'K' && !is_weight)
return 2;
// Spatial dimensions - map based on ndim_spatial
// Input/Output use: D/H/W, Weight uses: Z/Y/X
if(ndim_spatial == 1)
{
if(dim_char == 'W' || dim_char == 'X')
return 3;
}
else if(ndim_spatial == 2)
{
if(dim_char == 'H' || dim_char == 'Y')
return 3;
if(dim_char == 'W' || dim_char == 'X')
return 4;
}
else if(ndim_spatial == 3)
{
if(dim_char == 'D' || dim_char == 'Z')
return 3;
if(dim_char == 'H' || dim_char == 'Y')
return 4;
if(dim_char == 'W' || dim_char == 'X')
return 5;
}
// Should not reach here
return -1;
}
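// Example (illustrative): for a 2D weight layout such as GKYXC
// (ndim_spatial == 2, is_weight == true), the mapping is
//   'G' -> 0, 'K' -> 1, 'C' -> 2, 'Y' -> 3, 'X' -> 4,
// i.e. indices into a lengths array laid out as [G, K, C, Y, X].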
// Template function to compute layout-aware strides based on layout name
// The layout name directly encodes memory ordering from left to right
template <typename Layout>
inline std::vector<index_t> compute_conv_tensor_strides(const std::vector<index_t>& lengths,
index_t ndim_spatial)
{
constexpr const char* layout_name = Layout::name;
const int num_dims = static_cast<int>(lengths.size());
std::vector<index_t> strides(num_dims, 0);
// Determine if this is a weight tensor (has 'K' but not 'N')
bool has_k = false;
bool has_n = false;
for(const char* p = layout_name; *p != '\0'; ++p)
{
if(*p == 'K')
has_k = true;
if(*p == 'N')
has_n = true;
}
bool is_weight = has_k && !has_n;
// Build dimension ordering from layout name (parse string)
std::vector<char> dim_order;
const char dim_chars[] = {'G', 'N', 'K', 'C', 'D', 'H', 'W', 'X', 'Y', 'Z'};
for(const char* p = layout_name; *p != '\0'; ++p)
{
char c = *p;
// Skip underscores (strided layouts)
if(c == '_')
continue;
// Valid dimension characters
if(std::find(std::begin(dim_chars), std::end(dim_chars), c) != std::end(dim_chars))
{
dim_order.push_back(c);
}
}
// Compute strides: process from right to left (innermost to outermost)
index_t stride = 1;
for(int i = static_cast<int>(dim_order.size()) - 1; i >= 0; --i)
{
char dim_char = dim_order[i];
int length_idx = map_dim_char_to_index(dim_char, ndim_spatial, is_weight);
if(length_idx >= 0 && length_idx < num_dims)
{
strides[length_idx] = stride;
stride *= lengths[length_idx];
}
}
return strides;
}
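// Example (illustrative, assumed values): for a 2D NHWGC input whose lengths
// are stored as [G, N, C, H, W] = [2, 1, 8, 4, 4], the layout name "NHWGC"
// is walked right-to-left (C innermost), giving
//   strides == {8, 256, 1, 64, 16}  // in [G, N, C, H, W] order
// so stepping one H advances W*G*C = 64 elements:
//   auto strides = compute_conv_tensor_strides<
//       ck::tensor_layout::convolution::NHWGC>({2, 1, 8, 4, 4}, 2);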
// Unified kernel for strided tensor copy operations
// IsUnpack=false: Pack strided -> contiguous
// IsUnpack=true: Unpack contiguous -> strided
template <typename DataType, bool IsUnpack>
__global__ void strided_copy_kernel(const DataType* __restrict__ src,
DataType* __restrict__ dst,
const index_t* tensor_lengths,
const index_t* strided_strides,
int num_dims,
long_index_t total_elements)
{
const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const long_index_t num_threads = blockDim.x * gridDim.x;
for(long_index_t linear_idx = tid; linear_idx < total_elements; linear_idx += num_threads)
{
// Compute strided index from linear index
long_index_t remaining = linear_idx;
long_index_t strided_idx = 0;
for(int dim = num_dims - 1; dim >= 0; --dim)
{
index_t coord = remaining % tensor_lengths[dim];
remaining /= tensor_lengths[dim];
strided_idx += coord * strided_strides[dim];
}
// Direction determines which is src and which is dst
if constexpr(IsUnpack)
{
// Unpack: src is contiguous (linear_idx), dst is strided (strided_idx)
dst[strided_idx] = src[linear_idx];
}
else
{
// Pack: src is strided (strided_idx), dst is contiguous (linear_idx)
dst[linear_idx] = src[strided_idx];
}
}
}
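// Illustrative launch (hedged sketch mirroring the usage in naive_conv_fwd):
// pack a strided tensor into a contiguous buffer; unpacking later reuses the
// same lengths/strides arrays with IsUnpack = true:
//   constexpr int block_size = 256;
//   const long_index_t grid = (total + block_size - 1) / block_size;
//   strided_copy_kernel<float, /*IsUnpack=*/false>
//       <<<grid, block_size, 0, stream>>>(
//           p_strided, p_packed, d_lengths, d_strides, num_dims, total);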
} // namespace ref
} // namespace ck

View File

@@ -18,6 +18,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
namespace ck {
@@ -89,8 +90,39 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
wei_device_buf.ToDevice(wei.mData.data());
float max_accumulated_value = 0;
if(do_verification == 2)
{
// Use GPU reference for verification
std::cout << "Using GPU reference for verification" << std::endl;
// Allocate GPU reference output buffer
DeviceMem gpu_ref_in_buf(sizeof(InDataType) * in_host.mDesc.GetElementSpaceSize());
// Call GPU reference with ConvParam directly
ref::naive_conv_bwd_data<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
reinterpret_cast<InDataType*>(gpu_ref_in_buf.GetDeviceBuffer()),
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);
// Copy GPU reference result to host for comparison
gpu_ref_in_buf.FromDevice(in_host.mData.data());
max_accumulated_value = *std::max_element(in_host.mData.begin(), in_host.mData.end());
}
else if(do_verification == 1)
{
// Use CPU reference for verification (default)
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
InDataType,
WeiDataType,

View File

@@ -23,6 +23,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
namespace ck {
namespace profiler {
@@ -93,29 +94,69 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
float max_accumulated_value = 0;
if(do_verification)
{
if(do_verification == 1)
{
// CPU reference
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weight_host_result,
output,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
out_element_op,
{},
{},
{});
ref_invoker.Run(ref_argument);
}
else if(do_verification == 2)
{
// GPU reference
std::cout << "Running GPU reference implementation..." << std::endl;
// Allocate device memory for reference
DeviceMem in_ref_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
DeviceMem wei_ref_buf(sizeof(WeiDataType) *
weight_host_result.mDesc.GetElementSpaceSize());
DeviceMem out_ref_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize());
in_ref_buf.ToDevice(input.mData.data());
out_ref_buf.ToDevice(output.mData.data());
// Call GPU reference with ConvParam directly
ck::ref::naive_conv_bwd_weight<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
static_cast<const InDataType*>(in_ref_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_ref_buf.GetDeviceBuffer()),
static_cast<const OutDataType*>(out_ref_buf.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);
// Copy result back to host
wei_ref_buf.FromDevice(weight_host_result.mData.data());
}
max_accumulated_value =
*std::max_element(weight_host_result.mData.begin(), weight_host_result.mData.end());
}

View File

@@ -22,6 +22,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
namespace ck {
namespace profiler {
@@ -113,8 +114,38 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
wei_device_buf.ToDevice(weight.mData.data());
// run reference op
if(do_verification == 2)
{
// Use GPU reference for verification
std::cout << "Using GPU reference for verification" << std::endl;
// Allocate GPU reference output buffer
DeviceMem gpu_ref_out_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
// Call GPU reference with ConvParam directly
ref::naive_conv_fwd<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
reinterpret_cast<OutDataType*>(gpu_ref_out_buf.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);
// Copy GPU reference result to host for comparison
gpu_ref_out_buf.FromDevice(host_output.mData.data());
}
else if(do_verification == 1)
{
// Use CPU reference for verification (default)
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,

View File

@@ -311,4 +311,5 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx12")
endif()
add_subdirectory(position_embedding)
add_subdirectory(scatter_gather)
add_subdirectory(gpu_reference)
add_subdirectory(util)

View File

@@ -5,3 +5,8 @@ add_gtest_executable(test_convnd_fwd convnd_fwd_xdl.cpp)
if(result EQUAL 0)
target_link_libraries(test_convnd_fwd PRIVATE utility device_conv2d_fwd_instance)
endif()
add_gtest_executable(test_convnd_fwd_naive convnd_fwd_naive.cpp)
if(result EQUAL 0)
target_link_libraries(test_convnd_fwd_naive PRIVATE utility)
endif()

View File

@@ -0,0 +1,220 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <cstdlib>
#include <iostream>
#include <vector>
#include <gtest/gtest.h>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using InDataType = float;
using WeiDataType = float;
using OutDataType = float;
using AccDataType = float;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
using DeviceConvNaive = ck::tensor_operation::device::
DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K<InDataType,
WeiDataType,
OutDataType,
AccDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
template <ck::index_t NDimSpatial>
bool run_conv3d_naive_test(const ck::utils::conv::ConvParam& conv_param)
{
using namespace ck;
using namespace ck::tensor_operation::host;
using InLayout = ck::tensor_layout::convolution::GNCDHW;
using WeiLayout = ck::tensor_layout::convolution::GKCZYX;
using OutLayout = ck::tensor_layout::convolution::GNKDHW;
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
Tensor<InDataType> in(in_g_n_c_wis_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
// Initialize tensors
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
// Run device kernel - convert long_index_t vectors to index_t
std::vector<ck::index_t> input_spatial_lengths(conv_param.input_spatial_lengths_.begin(),
conv_param.input_spatial_lengths_.end());
std::vector<ck::index_t> filter_spatial_lengths(conv_param.filter_spatial_lengths_.begin(),
conv_param.filter_spatial_lengths_.end());
auto output_spatial_lengths_long = conv_param.GetOutputSpatialLengths();
std::vector<ck::index_t> output_spatial_lengths(output_spatial_lengths_long.begin(),
output_spatial_lengths_long.end());
std::vector<ck::index_t> conv_filter_strides(conv_param.conv_filter_strides_.begin(),
conv_param.conv_filter_strides_.end());
std::vector<ck::index_t> conv_filter_dilations(conv_param.conv_filter_dilations_.begin(),
conv_param.conv_filter_dilations_.end());
std::vector<ck::index_t> input_left_pads(conv_param.input_left_pads_.begin(),
conv_param.input_left_pads_.end());
std::vector<ck::index_t> input_right_pads(conv_param.input_right_pads_.begin(),
conv_param.input_right_pads_.end());
auto conv = DeviceConvNaive{};
auto invoker = conv.MakeInvoker();
auto argument =
conv.MakeArgument(static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param.N_,
conv_param.K_,
conv_param.C_,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv.IsSupportedArgument(argument))
{
std::cout << "Unsupported argument for naive conv3d kernel" << std::endl;
return false;
}
invoker.Run(argument, StreamConfig{nullptr, false});
// Run CPU reference
auto ref_conv = ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
0,
0,
0,
AccDataType>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in,
wei,
out_host,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
// Compare results
out_device_buf.FromDevice(out_device.mData.data());
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-3, 1e-3);
}
TEST(TestConv3dNaive, Conv3dNaive_Small)
{
// Small 3D convolution test
ck::utils::conv::ConvParam param{
3, // spatial_dim
1, // G
2, // N
16, // K
16, // C
{3, 3, 3}, // filter
{7, 7, 7}, // input spatial
{2, 2, 2}, // strides
{1, 1, 1}, // dilations
{1, 1, 1}, // left pads
{1, 1, 1} // right pads
};
bool pass = run_conv3d_naive_test<3>(param);
EXPECT_TRUE(pass);
}
TEST(TestConv3dNaive, Conv3dNaive_Medium)
{
// Medium size 3D convolution test
ck::utils::conv::ConvParam param{
3, // spatial_dim
1, // G
4, // N
32, // K
32, // C
{3, 3, 3}, // filter
{14, 14, 14}, // input spatial
{1, 1, 1}, // strides
{1, 1, 1}, // dilations
{1, 1, 1}, // left pads
{1, 1, 1} // right pads
};
bool pass = run_conv3d_naive_test<3>(param);
EXPECT_TRUE(pass);
}
TEST(TestConv3dNaive, Conv3dNaive_UnitFilter)
{
// 1x1x1 filter (no padding)
ck::utils::conv::ConvParam param{
3, // spatial_dim
1, // G
2, // N
24, // K
24, // C
{1, 1, 1}, // filter
{8, 8, 8}, // input spatial
{1, 1, 1}, // strides
{1, 1, 1}, // dilations
{0, 0, 0}, // left pads
{0, 0, 0} // right pads
};
bool pass = run_conv3d_naive_test<3>(param);
EXPECT_TRUE(pass);
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

View File

@@ -0,0 +1,11 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
add_gtest_executable(test_gpu_reference_conv_fwd test_gpu_reference_conv_fwd.cpp)
target_link_libraries(test_gpu_reference_conv_fwd PRIVATE utility)
add_gtest_executable(test_gpu_reference_conv_bwd_data test_gpu_reference_conv_bwd_data.cpp)
target_link_libraries(test_gpu_reference_conv_bwd_data PRIVATE utility)
add_gtest_executable(test_gpu_reference_conv_bwd_weight test_gpu_reference_conv_bwd_weight.cpp)
target_link_libraries(test_gpu_reference_conv_bwd_weight PRIVATE utility)

View File

@@ -0,0 +1,137 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck/ck.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include <vector>
namespace ck {
namespace test {
// Common test shapes for all convolution tests (fwd, bwd_data, bwd_weight)
namespace conv_test_shapes {
// 2D Conv, FP16, Small
inline ck::utils::conv::ConvParam get_2d_small()
{
return ck::utils::conv::ConvParam(2, // num_dim_spatial
1, // G
2, // N
8, // K
8, // C
{3, 3}, // filter_spatial
{7, 7}, // input_spatial
{1, 1}, // strides
{1, 1}, // dilations
{0, 0}, // left_pads
{0, 0} // right_pads
);
}
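// Note (derived from the parameters above): with a 3x3 filter, stride 1,
// dilation 1 and no padding, the 7x7 input yields a 5x5 output, since
//   Wo = (Wi + pads - dilation*(X - 1) - 1)/stride + 1 = (7 - 2 - 1)/1 + 1 = 5.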
// 2D Conv, FP32, Medium
inline ck::utils::conv::ConvParam get_2d_medium()
{
return ck::utils::conv::ConvParam(2, // num_dim_spatial
1, // G
4, // N
16, // K
16, // C
{3, 3}, // filter_spatial
{14, 14}, // input_spatial
{1, 1}, // strides
{1, 1}, // dilations
{0, 0}, // left_pads
{0, 0} // right_pads
);
}
// 1D Conv, FP16
inline ck::utils::conv::ConvParam get_1d()
{
return ck::utils::conv::ConvParam(1, // num_dim_spatial
1, // G
2, // N
8, // K
8, // C
{3}, // filter_spatial
{16}, // input_spatial
{1}, // strides
{1}, // dilations
{0}, // left_pads
{0} // right_pads
);
}
// 3D Conv, FP16, Small
inline ck::utils::conv::ConvParam get_3d_small()
{
return ck::utils::conv::ConvParam(3, // num_dim_spatial
1, // G
1, // N
8, // K
8, // C
{3, 3, 3}, // filter_spatial
{5, 5, 5}, // input_spatial
{1, 1, 1}, // strides
{1, 1, 1}, // dilations
{0, 0, 0}, // left_pads
{0, 0, 0} // right_pads
);
}
// 2D Conv with stride
inline ck::utils::conv::ConvParam get_2d_stride2()
{
return ck::utils::conv::ConvParam(2, // num_dim_spatial
1, // G
2, // N
8, // K
8, // C
{3, 3}, // filter_spatial
{8, 8}, // input_spatial
{2, 2}, // strides
{1, 1}, // dilations
{0, 0}, // left_pads
{0, 0} // right_pads
);
}
// 2D Grouped Conv, FP16, G=2
inline ck::utils::conv::ConvParam get_2d_grouped_g2()
{
return ck::utils::conv::ConvParam(2, // num_dim_spatial
2, // G
2, // N
8, // K (8 total output channels)
16, // C (16 total input channels, 8 per group with G=2)
{3, 3}, // filter_spatial
{7, 7}, // input_spatial
{1, 1}, // strides
{1, 1}, // dilations
{0, 0}, // left_pads
{0, 0} // right_pads
);
}
// 2D Grouped Conv, FP32, G=4
inline ck::utils::conv::ConvParam get_2d_grouped_g4()
{
return ck::utils::conv::ConvParam(2, // num_dim_spatial
4, // G
1, // N
16, // K (16 total output channels)
16, // C (16 total input channels, 4 per group with G=4)
{3, 3}, // filter_spatial
{8, 8}, // input_spatial
{1, 1}, // strides
{1, 1}, // dilations
{0, 0}, // left_pads
{0, 0} // right_pads
);
}
} // namespace conv_test_shapes
} // namespace test
} // namespace ck

View File

@@ -0,0 +1,385 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck/ck.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
// CPU references
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
// GPU references
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "common_test_params.hpp"
namespace ck {
namespace test {
enum class ConvKernelType
{
Forward,
BackwardData,
BackwardWeight
};
// Helper function to initialize and copy a tensor to device
template <typename DataType>
void initialize_and_copy_tensor(Tensor<DataType>& host_tensor, DeviceMem& device_mem)
{
host_tensor.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5});
device_mem.ToDevice(host_tensor.mData.data());
}
// Helper to get default layout types based on NDimSpatial
template <index_t NDimSpatial>
struct DefaultConvLayouts
{
using InLayout = std::conditional_t<NDimSpatial == 3,
tensor_layout::convolution::GNCDHW,
std::conditional_t<NDimSpatial == 2,
tensor_layout::convolution::GNCHW,
tensor_layout::convolution::GNCW>>;
using WeiLayout = std::conditional_t<NDimSpatial == 3,
tensor_layout::convolution::GKCZYX,
std::conditional_t<NDimSpatial == 2,
tensor_layout::convolution::GKCYX,
tensor_layout::convolution::GKCX>>;
using OutLayout = std::conditional_t<NDimSpatial == 3,
tensor_layout::convolution::GNKDHW,
std::conditional_t<NDimSpatial == 2,
tensor_layout::convolution::GNKHW,
tensor_layout::convolution::GNKW>>;
};
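// e.g. DefaultConvLayouts<2>::InLayout is GNCHW, WeiLayout is GKCYX, and
// OutLayout is GNKHW; tests that need non-default layouts pass them
// explicitly to test_conv_gpu_ref below.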
// Forward convolution implementation
template <index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
bool test_conv_fwd_impl(const ck::utils::conv::ConvParam& params,
const Tensor<InDataType>& input_cpu,
const Tensor<WeiDataType>& weight_cpu,
DeviceMem& input_dev,
DeviceMem& weight_dev,
DeviceMem& output_dev)
{
using InElementOp = tensor_operation::element_wise::PassThrough;
using WeiElementOp = tensor_operation::element_wise::PassThrough;
using OutElementOp = tensor_operation::element_wise::PassThrough;
// Call GPU reference with ConvParam directly
ref::naive_conv_fwd<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
reinterpret_cast<OutDataType*>(output_dev.GetDeviceBuffer()),
params);
HIP_CHECK_ERROR(hipDeviceSynchronize());
// Run CPU reference
std::vector<long_index_t> strides_long(params.conv_filter_strides_.begin(),
params.conv_filter_strides_.end());
std::vector<long_index_t> dilations_long(params.conv_filter_dilations_.begin(),
params.conv_filter_dilations_.end());
std::vector<long_index_t> pads_long(params.input_left_pads_.begin(),
params.input_left_pads_.end());
Tensor<InDataType> input_ref = input_cpu;
Tensor<WeiDataType> weight_ref = weight_cpu;
Tensor<OutDataType> output_ref(
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(params));
auto ref_conv = tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_arg = ref_conv.MakeArgument(input_ref,
weight_ref,
output_ref,
strides_long,
dilations_long,
pads_long,
pads_long,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_arg);
// Copy result from device and compare
Tensor<OutDataType> output_gpu(output_ref.mDesc);
output_dev.FromDevice(output_gpu.mData.data());
HIP_CHECK_ERROR(hipDeviceSynchronize());
// Compare results
return ck::utils::check_err(output_gpu, output_ref);
}
// Backward data convolution implementation
template <index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
bool test_conv_bwd_data_impl(const ck::utils::conv::ConvParam& params,
const Tensor<WeiDataType>& weight_cpu,
const Tensor<OutDataType>& output_cpu,
DeviceMem& weight_dev,
DeviceMem& output_dev,
DeviceMem& input_dev)
{
using InElementOp = tensor_operation::element_wise::PassThrough;
using WeiElementOp = tensor_operation::element_wise::PassThrough;
using OutElementOp = tensor_operation::element_wise::PassThrough;
// Call GPU reference with ConvParam directly
ref::naive_conv_bwd_data<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
reinterpret_cast<InDataType*>(input_dev.GetDeviceBuffer()),
reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(output_dev.GetDeviceBuffer()),
params);
HIP_CHECK_ERROR(hipDeviceSynchronize());
// Run CPU reference
std::vector<long_index_t> strides_long(params.conv_filter_strides_.begin(),
params.conv_filter_strides_.end());
std::vector<long_index_t> dilations_long(params.conv_filter_dilations_.begin(),
params.conv_filter_dilations_.end());
std::vector<long_index_t> pads_long(params.input_left_pads_.begin(),
params.input_left_pads_.end());
Tensor<InDataType> input_ref(
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(params));
Tensor<WeiDataType> weight_ref = weight_cpu;
Tensor<OutDataType> output_ref = output_cpu;
auto ref_conv = tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_arg = ref_conv.MakeArgument(input_ref,
weight_ref,
output_ref,
strides_long,
dilations_long,
pads_long,
pads_long,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_arg);
// Copy result from device and compare
Tensor<InDataType> input_gpu(input_ref.mDesc);
input_dev.FromDevice(input_gpu.mData.data());
HIP_CHECK_ERROR(hipDeviceSynchronize());
// Compare results
return ck::utils::check_err(input_gpu, input_ref);
}
// Backward weight convolution implementation
template <index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
bool test_conv_bwd_weight_impl(const ck::utils::conv::ConvParam& params,
const Tensor<InDataType>& input_cpu,
const Tensor<OutDataType>& output_cpu,
DeviceMem& input_dev,
DeviceMem& output_dev,
DeviceMem& weight_dev)
{
using InElementOp = tensor_operation::element_wise::PassThrough;
using WeiElementOp = tensor_operation::element_wise::PassThrough;
using OutElementOp = tensor_operation::element_wise::PassThrough;
// Call GPU reference with ConvParam directly
ref::naive_conv_bwd_weight<InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>(
reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
reinterpret_cast<WeiDataType*>(weight_dev.GetDeviceBuffer()),
reinterpret_cast<const OutDataType*>(output_dev.GetDeviceBuffer()),
params);
HIP_CHECK_ERROR(hipDeviceSynchronize());
// Run CPU reference
std::vector<long_index_t> strides_long(params.conv_filter_strides_.begin(),
params.conv_filter_strides_.end());
std::vector<long_index_t> dilations_long(params.conv_filter_dilations_.begin(),
params.conv_filter_dilations_.end());
std::vector<long_index_t> pads_long(params.input_left_pads_.begin(),
params.input_left_pads_.end());
Tensor<InDataType> input_ref = input_cpu;
Tensor<WeiDataType> weight_ref(
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(params));
Tensor<OutDataType> output_ref = output_cpu;
auto ref_conv = tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_arg = ref_conv.MakeArgument(input_ref,
weight_ref,
output_ref,
strides_long,
dilations_long,
pads_long,
pads_long,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_arg);
// Copy result from device and compare
Tensor<WeiDataType> weight_gpu(weight_ref.mDesc);
weight_dev.FromDevice(weight_gpu.mData.data());
HIP_CHECK_ERROR(hipDeviceSynchronize());
// Compare results
return ck::utils::check_err(weight_gpu, weight_ref);
}
// Main test function - dispatches to specific implementations
template <index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout = typename DefaultConvLayouts<NDimSpatial>::InLayout,
typename WeiLayout = typename DefaultConvLayouts<NDimSpatial>::WeiLayout,
typename OutLayout = typename DefaultConvLayouts<NDimSpatial>::OutLayout>
bool test_conv_gpu_ref(const ck::utils::conv::ConvParam& params, ConvKernelType kernel_type)
{
// Create tensor descriptors using the specified layouts
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(params);
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(params);
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(params);
// Create tensors using tensor descriptors (supports multiple layouts)
Tensor<InDataType> input(in_g_n_c_wis_desc);
Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
Tensor<OutDataType> output(out_g_n_k_wos_desc);
// Allocate device memory
DeviceMem input_dev(input.mData.size() * sizeof(InDataType));
DeviceMem weight_dev(weight.mData.size() * sizeof(WeiDataType));
DeviceMem output_dev(output.mData.size() * sizeof(OutDataType));
// Initialize and copy tensors based on kernel type
if(kernel_type == ConvKernelType::Forward)
{
initialize_and_copy_tensor(input, input_dev);
initialize_and_copy_tensor(weight, weight_dev);
}
else if(kernel_type == ConvKernelType::BackwardData)
{
initialize_and_copy_tensor(weight, weight_dev);
initialize_and_copy_tensor(output, output_dev);
}
else // BackwardWeight
{
initialize_and_copy_tensor(input, input_dev);
initialize_and_copy_tensor(output, output_dev);
}
// Dispatch to appropriate implementation with layout types
if(kernel_type == ConvKernelType::Forward)
{
return test_conv_fwd_impl<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InLayout,
WeiLayout,
OutLayout>(
params, input, weight, input_dev, weight_dev, output_dev);
}
else if(kernel_type == ConvKernelType::BackwardData)
{
return test_conv_bwd_data_impl<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InLayout,
WeiLayout,
OutLayout>(
params, weight, output, weight_dev, output_dev, input_dev);
}
else // BackwardWeight
{
return test_conv_bwd_weight_impl<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InLayout,
WeiLayout,
OutLayout>(
params, input, output, input_dev, output_dev, weight_dev);
}
}
} // namespace test
} // namespace ck

View File

@@ -0,0 +1,224 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include "gpu_reference_utils.hpp"
using namespace ck;
using ck::test::ConvKernelType;
TEST(GpuReferenceConvBwdData, Conv2DFP16Small)
{
auto params = test::conv_test_shapes::get_2d_small();
bool result =
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv2DFP32Medium)
{
auto params = test::conv_test_shapes::get_2d_medium();
bool result =
test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv1DFP16)
{
auto params = test::conv_test_shapes::get_1d();
bool result =
test::test_conv_gpu_ref<1, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv3DFP16Small)
{
auto params = test::conv_test_shapes::get_3d_small();
bool result =
test::test_conv_gpu_ref<3, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv2DFP16Stride2)
{
auto params = test::conv_test_shapes::get_2d_stride2();
bool result =
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedG2)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result =
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedG4)
{
auto params = test::conv_test_shapes::get_2d_grouped_g4();
bool result =
test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
float,
float,
float,
tensor_layout::convolution::NHWGC,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NHWGK>(
params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
half_t,
half_t,
half_t,
tensor_layout::convolution::NHWGC,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NHWGK>(
params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
float,
float,
float,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NGKHW>(
params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
half_t,
half_t,
half_t,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NGKHW>(
params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
float,
float,
float,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKCYX,
tensor_layout::convolution::NGKHW>(
params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
half_t,
half_t,
half_t,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKCYX,
tensor_layout::convolution::NGKHW>(
params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
float,
float,
float,
tensor_layout::convolution::NDHWGC,
tensor_layout::convolution::GKZYXC,
tensor_layout::convolution::NDHWGK>(
params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
half_t,
half_t,
half_t,
tensor_layout::convolution::NDHWGC,
tensor_layout::convolution::GKZYXC,
tensor_layout::convolution::NDHWGK>(
params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
float,
float,
float,
tensor_layout::convolution::NGCDHW,
tensor_layout::convolution::GKCZYX,
tensor_layout::convolution::NGKDHW>(
params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdData, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
half_t,
half_t,
half_t,
tensor_layout::convolution::NGCDHW,
tensor_layout::convolution::GKCZYX,
tensor_layout::convolution::NGKDHW>(
params, ConvKernelType::BackwardData);
EXPECT_TRUE(result);
}

View File

@@ -0,0 +1,224 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include "gpu_reference_utils.hpp"
using namespace ck;
using ck::test::ConvKernelType;
TEST(GpuReferenceConvBwdWeight, Conv2DFP16Small)
{
auto params = test::conv_test_shapes::get_2d_small();
bool result =
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv2DFP32Medium)
{
auto params = test::conv_test_shapes::get_2d_medium();
bool result =
test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv1DFP16)
{
auto params = test::conv_test_shapes::get_1d();
bool result =
test::test_conv_gpu_ref<1, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv3DFP16Small)
{
auto params = test::conv_test_shapes::get_3d_small();
bool result =
test::test_conv_gpu_ref<3, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv2DFP16Stride2)
{
auto params = test::conv_test_shapes::get_2d_stride2();
bool result =
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedG2)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result =
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedG4)
{
auto params = test::conv_test_shapes::get_2d_grouped_g4();
bool result =
test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
float,
float,
float,
tensor_layout::convolution::NHWGC,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NHWGK>(
params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
half_t,
half_t,
half_t,
tensor_layout::convolution::NHWGC,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NHWGK>(
params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
float,
float,
float,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NGKHW>(
params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
half_t,
half_t,
half_t,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NGKHW>(
params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
float,
float,
float,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKCYX,
tensor_layout::convolution::NGKHW>(
params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result = test::test_conv_gpu_ref<2,
half_t,
half_t,
half_t,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKCYX,
tensor_layout::convolution::NGKHW>(
params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
float,
float,
float,
tensor_layout::convolution::NDHWGC,
tensor_layout::convolution::GKZYXC,
tensor_layout::convolution::NDHWGK>(
params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
half_t,
half_t,
half_t,
tensor_layout::convolution::NDHWGC,
tensor_layout::convolution::GKZYXC,
tensor_layout::convolution::NDHWGK>(
params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
float,
float,
float,
tensor_layout::convolution::NGCDHW,
tensor_layout::convolution::GKCZYX,
tensor_layout::convolution::NGKDHW>(
params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvBwdWeight, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
half_t,
half_t,
half_t,
tensor_layout::convolution::NGCDHW,
tensor_layout::convolution::GKCZYX,
tensor_layout::convolution::NGKDHW>(
params, ConvKernelType::BackwardWeight);
EXPECT_TRUE(result);
}

View File

@@ -0,0 +1,222 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <gtest/gtest.h>
#include "gpu_reference_utils.hpp"
using namespace ck;
using ck::test::ConvKernelType;
TEST(GpuReferenceConvFwd, Conv2DFP16Small)
{
auto params = test::conv_test_shapes::get_2d_small();
bool result =
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv2DFP32Medium)
{
auto params = test::conv_test_shapes::get_2d_medium();
bool result = test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv1DFP16)
{
auto params = test::conv_test_shapes::get_1d();
bool result =
test::test_conv_gpu_ref<1, half_t, half_t, half_t>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv3DFP16Small)
{
auto params = test::conv_test_shapes::get_3d_small();
bool result =
test::test_conv_gpu_ref<3, half_t, half_t, half_t>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv2DFP16Stride2)
{
auto params = test::conv_test_shapes::get_2d_stride2();
bool result =
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv2DFP16GroupedG2)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result =
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv2DFP32GroupedG4)
{
auto params = test::conv_test_shapes::get_2d_grouped_g4();
bool result = test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result =
test::test_conv_gpu_ref<2,
float,
float,
float,
tensor_layout::convolution::NHWGC,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NHWGK>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result =
test::test_conv_gpu_ref<2,
half_t,
half_t,
half_t,
tensor_layout::convolution::NHWGC,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NHWGK>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result =
test::test_conv_gpu_ref<2,
float,
float,
float,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result =
test::test_conv_gpu_ref<2,
half_t,
half_t,
half_t,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKYXC,
tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result =
test::test_conv_gpu_ref<2,
float,
float,
float,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKCYX,
tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
{
auto params = test::conv_test_shapes::get_2d_grouped_g2();
bool result =
test::test_conv_gpu_ref<2,
half_t,
half_t,
half_t,
tensor_layout::convolution::NGCHW,
tensor_layout::convolution::GKCYX,
tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
float,
float,
float,
tensor_layout::convolution::NDHWGC,
tensor_layout::convolution::GKZYXC,
tensor_layout::convolution::NDHWGK>(
params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
half_t,
half_t,
half_t,
tensor_layout::convolution::NDHWGC,
tensor_layout::convolution::GKZYXC,
tensor_layout::convolution::NDHWGK>(
params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
float,
float,
float,
tensor_layout::convolution::NGCDHW,
tensor_layout::convolution::GKCZYX,
tensor_layout::convolution::NGKDHW>(
params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
TEST(GpuReferenceConvFwd, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
{
auto params = test::conv_test_shapes::get_3d_small();
// Modify to be grouped (G=2)
params.G_ = 2;
params.C_ = 16; // 8 per group
params.K_ = 16; // 8 per group
bool result = test::test_conv_gpu_ref<3,
half_t,
half_t,
half_t,
tensor_layout::convolution::NGCDHW,
tensor_layout::convolution::GKCZYX,
tensor_layout::convolution::NGKDHW>(
params, ConvKernelType::Forward);
EXPECT_TRUE(result);
}
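
// A possible consolidation, not part of the original change: the grouped 2-D
// cases above differ only in data type and layout triple, so a typed test
// could express them once (the backward suites could be folded the same way).
// A minimal sketch; the Case/LayoutCases names are illustrative only.
template <typename T>
class GpuReferenceConvFwdLayouts : public ::testing::Test
{
};

template <typename Data, typename In, typename Wei, typename Out>
struct Case
{
    using DataType  = Data;
    using InLayout  = In;
    using WeiLayout = Wei;
    using OutLayout = Out;
};

namespace conv_layout = tensor_layout::convolution;

using LayoutCases = ::testing::Types<
    Case<float, conv_layout::NHWGC, conv_layout::GKYXC, conv_layout::NHWGK>,
    Case<half_t, conv_layout::NHWGC, conv_layout::GKYXC, conv_layout::NHWGK>,
    Case<float, conv_layout::NGCHW, conv_layout::GKYXC, conv_layout::NGKHW>,
    Case<half_t, conv_layout::NGCHW, conv_layout::GKCYX, conv_layout::NGKHW>>;

TYPED_TEST_SUITE(GpuReferenceConvFwdLayouts, LayoutCases);

TYPED_TEST(GpuReferenceConvFwdLayouts, Grouped2D)
{
    auto params = test::conv_test_shapes::get_2d_grouped_g2();
    bool result = test::test_conv_gpu_ref<2,
                                          typename TypeParam::DataType,
                                          typename TypeParam::DataType,
                                          typename TypeParam::DataType,
                                          typename TypeParam::InLayout,
                                          typename TypeParam::WeiLayout,
                                          typename TypeParam::OutLayout>(
        params, ConvKernelType::Forward);
    EXPECT_TRUE(result);
}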

View File

@@ -73,7 +73,7 @@ bool RunConvBwdDataTest(const ck::utils::conv::ConvParam& param, ck::index_t spl
InLayout,
DataType,
DataType,
DataType>(true, // do_verification
DataType>(2, // do_verification
1, // init_method
false, // do_log
false, // time_kernel

View File

@@ -47,7 +47,7 @@ class TestGroupedConvndBwdDataXdl : public ::testing::Test
DataType,
DataType,
DataType>(
true, // do_verification
2, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel

View File

@@ -73,7 +73,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
InDataType,
WeiDataType,
OutDataType>(
true, // do_verification
2, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel

View File

@@ -80,7 +80,7 @@ bool RunConvBwdWeightTest(const ck::utils::conv::ConvParam& param, ck::index_t s
InDataType,
WeiDataType,
OutDataType>(
true, // do_verification
2, // do_verification
1, // init_method
false, // do_log
false, // time_kernel

View File

@@ -46,7 +46,7 @@ class TestGroupedConvndFwd : public ::testing::Test
DataType,
DataType,
IndexType>(
true, // do_verification
2, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel

View File

@@ -77,7 +77,7 @@ bool RunConvTest(const ck::utils::conv::ConvParam& param)
DataType,
DataType,
DataType,
IndexType>(true, // do_verification
IndexType>(2, // do_verification
1, // init_method
false, // do_log
false, // time_kernel