diff --git a/example/09_convnd_fwd/convnd_fwd_common.hpp b/example/09_convnd_fwd/convnd_fwd_common.hpp
index 316dcadb7b..9d528bf737 100644
--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
@@ -131,6 +131,9 @@ template
           typename DeviceGroupedConvNDFwdInstance,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
           typename ComputeDataType>
 bool run_grouped_conv_fwd(int do_verification,
                           int init_method,
@@ -283,31 +286,25 @@ bool run_grouped_conv_fwd(int do_verification,
         DeviceMem out_device_ref_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
         out_device_ref_buf.SetZero();
 
-        // Extract dimensions using helper function
-        ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
-
-        // Launch GPU reference kernel
-        constexpr ck::index_t block_size = 256;
-        const ck::long_index_t output_length = dims.N * dims.Do * dims.Ho * dims.Wo * dims.K;
-        const ck::index_t grid_size = (output_length + block_size - 1) / block_size;
-
-        auto gpu_ref_kernel = ck::ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk;
-
-        gpu_ref_kernel<<<grid_size, block_size>>>(
+        // Call GPU reference with ConvParam directly, using the correct layout types
+        ck::ref::naive_conv_fwd<InDataType, WeiDataType, OutDataType, InLayout, WeiLayout, OutLayout>(
             reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
             reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
             reinterpret_cast<OutDataType*>(out_device_ref_buf.GetDeviceBuffer()),
-            dims);
+            conv_param);
 
         HIP_CHECK_ERROR(hipDeviceSynchronize());
-        std::cout << "GPU reference kernel completed successfully, copying results..." << std::endl;
+        std::cout << "GPU reference function completed successfully, copying results..."
+                  << std::endl;
 
         // Copy GPU reference result to host
         out_device_ref_buf.FromDevice(out_host.mData.data());
diff --git a/example/09_convnd_fwd/run_convnd_fwd_example.inc b/example/09_convnd_fwd/run_convnd_fwd_example.inc
index 4c394821a8..33eddaf0d5 100644
--- a/example/09_convnd_fwd/run_convnd_fwd_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_example.inc
@@ -12,7 +12,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
 {
     print_helper_msg();
 
-    int do_verification = 1; // 0=no, 1=CPU, 2=GPU
+    int do_verification = 2; // 0=no, 1=CPU, 2=GPU
     int init_method = 1;
     bool time_kernel = false;
@@ -71,6 +71,9 @@ bool run_convnd_fwd_example(int argc, char* argv[])
                               WeiElementOp,
                               OutElementOp,
                               DeviceGroupedConvNDFwdInstance,
+                              InLayout,
+                              WeiLayout,
+                              OutLayout,
                               ComputeDataType>(do_verification,
                                                init_method,
                                                time_kernel,
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
index f278107c9e..c8f1b1459e 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
@@ -18,7 +18,8 @@
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
 #include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
-#include "ck_tile/host/hip_check_error.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
 
 using ::ck::DeviceMem;
 using ::ck::HostTensorDescriptor;
@@ -81,7 +82,10 @@ template
-          typename DeviceConvNdBwdDataInstance>
+          typename DeviceConvNdBwdDataInstance,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
 int run_conv_bwd_data(int do_verification,
                       int init_method,
                       bool time_kernel,
@@ -225,50 +229,52 @@ int run_conv_bwd_data(int do_verification,
     }
     else if(do_verification == 2)
     {
-        // GPU verification
+        // GPU verification using naive GPU reference
         std::cout << "Running GPU verification..." << std::endl;
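+        // For bwd-data the data flow is reversed: out_device_buf (the output gradient) and
+        // wei_device_buf are the inputs, and the reference writes the input gradient.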
+        // Allocate and ZERO GPU memory for reference input
         DeviceMem in_device_ref_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
         in_device_ref_buf.SetZero();
 
-        // Extract dimensions using helper function
-        ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
-
-        constexpr ck::index_t block_size = 256;
-        const ck::long_index_t input_length = dims.N * dims.Di * dims.Hi * dims.Wi * dims.C;
-        const ck::index_t grid_size = (input_length + block_size - 1) / block_size;
-
-        auto gpu_ref_kernel = ck::ref::naive_conv_bwd_data_ndhwc_kzyxc_ndhwk;
-
-        gpu_ref_kernel<<<grid_size, block_size>>>(
+        // Call GPU reference with ConvParam directly, using the correct layout types
+        ck::ref::naive_conv_bwd_data<InDataType, WeiDataType, OutDataType, InLayout, WeiLayout, OutLayout>(
            reinterpret_cast<InDataType*>(in_device_ref_buf.GetDeviceBuffer()),
            reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
            reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            dims);
+            conv_param,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
 
         HIP_CHECK_ERROR(hipDeviceSynchronize());
-        std::cout << "GPU reference kernel completed, copying results..." << std::endl;
+        std::cout << "GPU reference function completed successfully, copying results..."
+                  << std::endl;
 
-        // Copy GPU reference result
+        // Copy GPU reference result to host
         Tensor<InDataType> in_gpu_ref(in_host.mDesc);
         in_device_ref_buf.FromDevice(in_gpu_ref.mData.data());
 
-        // Copy optimized kernel result
+        // Copy GPU kernel result to host
         in_device_buf.FromDevice(in_device.mData.data());
 
+        std::cout << "Comparing GPU kernel output vs GPU reference..." << std::endl;
+
         // Compare: Optimized kernel result vs GPU reference result
         bool pass = ck::utils::check_err(
             in_device, in_gpu_ref, "Error: Incorrect results!", get_rtol(), get_atol());
+        std::cout << "GPU verification result is: " << (pass ? "pass" : "fail") << std::endl;
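+        // check_err compares the two device results element-wise using the example's
+        // rtol/atol tolerance helpers.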
 
         return pass ? 0 : 1;
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
index eaae245db1..73aa7c50e3 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
@@ -92,16 +92,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<1>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<1>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
     else if(conv_param.num_dim_spatial_ == 2)
     {
@@ -128,16 +131,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<2>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<2>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
     else if(conv_param.num_dim_spatial_ == 3)
     {
@@ -164,16 +170,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<3>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<3>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
 
     return 0;
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
index 3d1c70bea0..9f83620c94 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
@@ -119,16 +119,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<1>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<1>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
     else if(conv_param.num_dim_spatial_ == 2)
     {
@@ -155,16 +158,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<2>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<2>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
     else if(conv_param.num_dim_spatial_ == 3)
     {
@@ -191,16 +197,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<3>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<3>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
 
     return 0;
diff --git a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
index 8cc9f582eb..70c43b81b3 100644
--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
@@ -149,55 +149,53 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
     }
     else if(config.do_verification == 2)
     {
-        // GPU verification (only supports G=1, standard convolution)
-        if(conv_param.G_ != 1)
-        {
-            std::cout << "GPU verification only supports G=1 (standard convolution)" << std::endl;
-            std::cout << "Current G=" << conv_param.G_ << " not supported." << std::endl;
-            std::cout << "Use do_verification=1 for CPU verification with grouped convolution."
-                      << std::endl;
-            return true;
-        }
-
-        std::cout << "Running GPU verification (G=1)..." << std::endl;
+        // GPU verification using naive GPU reference
+        std::cout << "Running GPU verification..." << std::endl;
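+        // The ConvParam-based reference handles grouped convolution directly, so the old
+        // G == 1 restriction is gone and grouped cases are verified on the GPU as well.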
"correct" : "fail") << std::endl; return pass; diff --git a/example/test_old_ck_gpu_reference.cpp b/example/test_old_ck_gpu_reference.cpp deleted file mode 100644 index 9f12eaea4d..0000000000 --- a/example/test_old_ck_gpu_reference.cpp +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -// Standalone test program for Old CK GPU references -// Tests naive_conv_fwd (existing) and future backward ops - -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" - -// CPU reference for validation -#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" - -// GPU reference (OLD CK - already exists!) -#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" - -using namespace ck; - -template -struct ConvParams -{ - index_t N, K, C; - std::vector input_spatial; - std::vector filter_spatial; - std::vector output_spatial; - std::vector strides; - std::vector dilations; - std::vector pads; -}; - -template -bool test_conv_forward_gpu_ref(const ConvParams& params, const std::string& test_name) -{ - std::cout << "[TEST] " << test_name << std::endl; - - // Calculate dimensions - const index_t N = params.N; - const index_t K = params.K; - const index_t C = params.C; - - // Create tensor descriptors (NDHWC layout for old CK) - std::vector in_lengths = {N}; - for(auto d : params.input_spatial) - in_lengths.push_back(d); - in_lengths.push_back(C); - - std::vector wei_lengths = {K}; - for(auto d : params.filter_spatial) - wei_lengths.push_back(d); - wei_lengths.push_back(C); - - std::vector out_lengths = {N}; - for(auto d : params.output_spatial) - out_lengths.push_back(d); - out_lengths.push_back(K); - - // Create host tensors - Tensor input(in_lengths); - Tensor weight(wei_lengths); - Tensor output_gpu(out_lengths); - Tensor output_ref(out_lengths); - - // Initialize with random data - input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - weight.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - - // Allocate device memory - DeviceMem input_dev(input.mData.size() * sizeof(InDataType)); - DeviceMem weight_dev(weight.mData.size() * sizeof(WeiDataType)); - DeviceMem output_dev(output_gpu.mData.size() * sizeof(OutDataType)); - - // Copy to device - input_dev.ToDevice(input.mData.data()); - weight_dev.ToDevice(weight.mData.data()); - - // Run CPU reference for validation - auto ref_conv = - tensor_operation::host::ReferenceConvFwd(); - - auto ref_invoker = ref_conv.MakeInvoker(); - auto ref_arg = ref_conv.MakeArgument(input.mData.data(), - weight.mData.data(), - output_ref.mData.data(), - N, - K, - C, - params.input_spatial, - params.filter_spatial, - params.output_spatial, - params.strides, - params.dilations, - params.pads, - params.pads, - {}, - {}, - {}); - - ref_invoker.Run(ref_arg); - - // Run GPU reference (OLD CK) - using InElementOp = tensor_operation::element_wise::PassThrough; - using WeiElementOp = tensor_operation::element_wise::PassThrough; - using OutElementOp = tensor_operation::element_wise::PassThrough; - - constexpr index_t block_size = 256; - - // Extract dimensions based on NDimSpatial - index_t Di = 1, Hi = 1, Wi = 1; - index_t Z = 1, Y = 1, X = 1; - index_t Do = 1, Ho = 1, Wo = 1; - index_t stride_z = 
-    index_t dilation_z = 1, dilation_y = 1, dilation_x = 1;
-    index_t pad_z = 0, pad_y = 0, pad_x = 0;
-
-    if(NDimSpatial == 1)
-    {
-        Wi = params.input_spatial[0];
-        X = params.filter_spatial[0];
-        Wo = params.output_spatial[0];
-        stride_x = params.strides[0];
-        dilation_x = params.dilations[0];
-        pad_x = params.pads[0];
-    }
-    else if(NDimSpatial == 2)
-    {
-        Hi = params.input_spatial[0];
-        Wi = params.input_spatial[1];
-        Y = params.filter_spatial[0];
-        X = params.filter_spatial[1];
-        Ho = params.output_spatial[0];
-        Wo = params.output_spatial[1];
-        stride_y = params.strides[0];
-        stride_x = params.strides[1];
-        dilation_y = params.dilations[0];
-        dilation_x = params.dilations[1];
-        pad_y = params.pads[0];
-        pad_x = params.pads[1];
-    }
-    else if(NDimSpatial == 3)
-    {
-        Di = params.input_spatial[0];
-        Hi = params.input_spatial[1];
-        Wi = params.input_spatial[2];
-        Z = params.filter_spatial[0];
-        Y = params.filter_spatial[1];
-        X = params.filter_spatial[2];
-        Do = params.output_spatial[0];
-        Ho = params.output_spatial[1];
-        Wo = params.output_spatial[2];
-        stride_z = params.strides[0];
-        stride_y = params.strides[1];
-        stride_x = params.strides[2];
-        dilation_z = params.dilations[0];
-        dilation_y = params.dilations[1];
-        dilation_x = params.dilations[2];
-        pad_z = params.pads[0];
-        pad_y = params.pads[1];
-        pad_x = params.pads[2];
-    }
-
-    // Launch GPU reference kernel
-    const long_index_t output_length = N * Do * Ho * Wo * K;
-    const index_t grid_size = (output_length + block_size - 1) / block_size;
-
-    hipLaunchKernelGGL(ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk,
-                       dim3(grid_size),
-                       dim3(block_size),
-                       0,
-                       nullptr,
-                       reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
-                       reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
-                       reinterpret_cast<OutDataType*>(output_dev.GetDeviceBuffer()),
-                       N, K, C,
-                       Di, Hi, Wi,
-                       Z, Y, X,
-                       Do, Ho, Wo,
-                       stride_z, stride_y, stride_x,
-                       dilation_z, dilation_y, dilation_x,
-                       pad_z, pad_y, pad_x);
-
-    hipDeviceSynchronize();
-
-    // Copy result back
-    output_dev.FromDevice(output_gpu.mData.data());
-
-    // Compare GPU ref vs CPU ref
-    bool pass = check_err(output_gpu.mData, output_ref.mData, "GPU vs CPU ref", 1e-3, 1e-3);
-
-    std::cout << "  Result: " << (pass ? "✅ PASS" : "❌ FAIL") << std::endl;
"✅ PASS" : "❌ FAIL") << std::endl; - - return pass; -} - -int main(int argc, char* argv[]) -{ - std::cout << "========================================" << std::endl; - std::cout << "Old CK GPU Reference Test Program" << std::endl; - std::cout << "========================================" << std::endl; - std::cout << std::endl; - - int passed = 0; - int failed = 0; - - // Test 1: 2D Conv, FP16, Small - { - ConvParams<2> params; - params.N = 2; - params.K = 8; - params.C = 8; - params.input_spatial = {7, 7}; - params.filter_spatial = {3, 3}; - params.output_spatial = {5, 5}; - params.strides = {1, 1}; - params.dilations = {1, 1}; - params.pads = {0, 0}; - - if(test_conv_forward_gpu_ref<2, half_t, half_t, half_t>(params, "2D-FP16-Small")) - passed++; - else - failed++; - } - - // Test 2: 2D Conv, FP32, Medium - { - ConvParams<2> params; - params.N = 4; - params.K = 16; - params.C = 16; - params.input_spatial = {14, 14}; - params.filter_spatial = {3, 3}; - params.output_spatial = {12, 12}; - params.strides = {1, 1}; - params.dilations = {1, 1}; - params.pads = {0, 0}; - - if(test_conv_forward_gpu_ref<2, float, float, float>(params, "2D-FP32-Medium")) - passed++; - else - failed++; - } - - // Test 3: 1D Conv, FP16 - { - ConvParams<1> params; - params.N = 2; - params.K = 8; - params.C = 8; - params.input_spatial = {16}; - params.filter_spatial = {3}; - params.output_spatial = {14}; - params.strides = {1}; - params.dilations = {1}; - params.pads = {0}; - - if(test_conv_forward_gpu_ref<1, half_t, half_t, half_t>(params, "1D-FP16")) - passed++; - else - failed++; - } - - // Test 4: 3D Conv, FP16, Small - { - ConvParams<3> params; - params.N = 1; - params.K = 8; - params.C = 8; - params.input_spatial = {5, 5, 5}; - params.filter_spatial = {3, 3, 3}; - params.output_spatial = {3, 3, 3}; - params.strides = {1, 1, 1}; - params.dilations = {1, 1, 1}; - params.pads = {0, 0, 0}; - - if(test_conv_forward_gpu_ref<3, half_t, half_t, half_t>(params, "3D-FP16-Small")) - passed++; - else - failed++; - } - - // Test 5: 2D Conv with stride - { - ConvParams<2> params; - params.N = 2; - params.K = 8; - params.C = 8; - params.input_spatial = {8, 8}; - params.filter_spatial = {3, 3}; - params.output_spatial = {3, 3}; - params.strides = {2, 2}; - params.dilations = {1, 1}; - params.pads = {0, 0}; - - if(test_conv_forward_gpu_ref<2, half_t, half_t, half_t>(params, "2D-FP16-Stride2")) - passed++; - else - failed++; - } - - std::cout << std::endl; - std::cout << "========================================" << std::endl; - std::cout << "SUMMARY" << std::endl; - std::cout << "========================================" << std::endl; - std::cout << "Total: " << (passed + failed) << std::endl; - std::cout << "Passed: " << passed << " ✅" << std::endl; - std::cout << "Failed: " << failed << std::endl; - std::cout << std::endl; - - if(failed == 0) - { - std::cout << "🎉 ALL TESTS PASSED!" 
-        std::cout << "Old CK Forward GPU Reference: WORKING ✅" << std::endl;
-        return 0;
-    }
-    else
-    {
-        std::cout << "❌ SOME TESTS FAILED" << std::endl;
-        return 1;
-    }
-}
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
index 2cdb70e2a2..a8e5b18ed8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
@@ -7,11 +7,12 @@
 #include
 #include
 #include
-#include "conv_util.hpp"
-#include "device.hpp"
-#include "device_conv_fwd.hpp"
-#include "common_header.hpp"
-#include "naive_conv_fwd_gpu.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/stream_config.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -26,7 +27,16 @@ template
 struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K
-    : public DeviceConvFwd
+    : public DeviceConvFwd<3,
+                           ck::tensor_layout::convolution::NDHWC,
+                           ck::tensor_layout::convolution::KZYXC,
+                           ck::tensor_layout::convolution::NDHWK,
+                           InDataType,
+                           WeiDataType,
+                           OutDataType,
+                           InElementwiseOperation,
+                           WeiElementwiseOperation,
+                           OutElementwiseOperation>
 {
     using DeviceOp = DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K;
@@ -57,6 +67,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
                  WeiElementwiseOperation wei_element_op,
                  OutElementwiseOperation out_element_op)
             : params_{3,
+                      1, // G (group count, always 1 for non-grouped)
                       N,
                       K,
                       C,
@@ -78,7 +89,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
         }
 
         // private:
-        utils::conv::ConvParams params_;
+        utils::conv::ConvParam params_;
         std::vector<index_t> out_spatial_lengths_;
 
         const InDataType* p_in_;
@@ -97,46 +108,28 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            const auto naive_conv3d_fwd =
-                ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk;
+            using InLayout  = ck::tensor_layout::convolution::GNCDHW;
+            using WeiLayout = ck::tensor_layout::convolution::GKCZYX;
+            using OutLayout = ck::tensor_layout::convolution::GNKDHW;
 
-            float ave_time = launch_and_time_kernel(stream_config,
-                                                    naive_conv3d_fwd,
-                                                    dim3(256),
-                                                    dim3(256),
-                                                    0,
-                                                    arg.p_in_,
-                                                    arg.p_wei_,
-                                                    arg.p_out_,
-                                                    arg.N_, arg.K_, arg.C_,
-                                                    arg.in_spatial_lengths_[0],
-                                                    arg.in_spatial_lengths_[1],
-                                                    arg.in_spatial_lengths_[2],
-                                                    arg.filter_spatial_lengths_[0],
-                                                    arg.filter_spatial_lengths_[1],
-                                                    arg.filter_spatial_lengths_[2],
-                                                    arg.out_spatial_lengths_[0],
-                                                    arg.out_spatial_lengths_[1],
-                                                    arg.out_spatial_lengths_[2],
-                                                    arg.conv_filter_strides_[0],
-                                                    arg.conv_filter_strides_[1],
-                                                    arg.conv_filter_strides_[2],
-                                                    arg.conv_filter_dilations_[0],
-                                                    arg.conv_filter_dilations_[1],
-                                                    arg.conv_filter_dilations_[2],
-                                                    arg.in_left_pads_[0],
-                                                    arg.in_left_pads_[1],
-                                                    arg.in_left_pads_[2]);
-
-            return ave_time;
+            // Use simplified ConvParam-based API
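+            // A single ConvParam argument replaces the two dozen scalar dimension
+            // arguments the old kernel launch had to spell out.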
+            ref::naive_conv_fwd<InDataType, WeiDataType, OutDataType, InLayout, WeiLayout, OutLayout>(
+                arg.p_in_,
+                arg.p_wei_,
+                arg.p_out_,
+                arg.params_,
+                arg.in_element_op_,
+                arg.wei_element_op_,
+                arg.out_element_op_,
+                stream_config.stream_id_);
+            return 0; // No timing for naive implementation
         }
 
         // polymorphic
@@ -155,7 +148,9 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
     static bool IsSupportedArgument(const Argument& arg)
     {
-        std::vector<index_t> out_spatial_lengths = arg.params_.GetOutputSpatialLengths();
+        auto out_spatial_lengths_long = arg.params_.GetOutputSpatialLengths();
+        std::vector<index_t> out_spatial_lengths(out_spatial_lengths_long.begin(),
+                                                 out_spatial_lengths_long.end());
 
         bool out_lengths_are_consistent = out_spatial_lengths[0] == arg.out_spatial_lengths_[0] &&
                                           out_spatial_lengths[1] == arg.out_spatial_lengths_[1] &&
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/conv_common.hpp b/library/include/ck/library/reference_tensor_operation/gpu/conv_common.hpp
deleted file mode 100644
index 285271c6ef..0000000000
--- a/library/include/ck/library/reference_tensor_operation/gpu/conv_common.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#ifndef CONV_COMMON_HPP
-#define CONV_COMMON_HPP
-
-#include "ck/ck.hpp"
-
-namespace ck {
-namespace ref {
-
-// Device-compatible dimension structure for GPU reference kernels
-// Replaces passing 24 individual parameters
-struct ConvDims
-{
-    index_t N, K, C;
-    index_t Di, Hi, Wi;
-    index_t Z, Y, X;
-    index_t Do, Ho, Wo;
-    index_t stride_z, stride_y, stride_x;
-    index_t dilation_z, dilation_y, dilation_x;
-    index_t pad_z, pad_y, pad_x;
-};
-
-} // namespace ref
-
-// Helper function to extract dimensions from ConvParam for GPU kernels
-// Defined in ck::utils::conv namespace for convenience
-namespace utils {
-namespace conv {
-
-inline ck::ref::ConvDims
-extract_conv_dims(const ConvParam& conv_param, ck::index_t NDimSpatial, bool apply_group = true)
-{
-    ck::ref::ConvDims dims;
-    dims.N = conv_param.N_;
-    dims.K = conv_param.K_;
-    dims.C = apply_group ? (conv_param.C_ * conv_param.G_) : conv_param.C_;
-
-    dims.Di = (NDimSpatial >= 3) ? conv_param.input_spatial_lengths_[0] : 1;
-    dims.Hi = (NDimSpatial >= 2) ? conv_param.input_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
-    dims.Wi = conv_param.input_spatial_lengths_[NDimSpatial - 1];
-
-    dims.Z = (NDimSpatial >= 3) ? conv_param.filter_spatial_lengths_[0] : 1;
-    dims.Y = (NDimSpatial >= 2) ? conv_param.filter_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
-    dims.X = conv_param.filter_spatial_lengths_[NDimSpatial - 1];
-
-    dims.Do = (NDimSpatial >= 3) ? conv_param.output_spatial_lengths_[0] : 1;
-    dims.Ho = (NDimSpatial >= 2) ? conv_param.output_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
-    dims.Wo = conv_param.output_spatial_lengths_[NDimSpatial - 1];
-
-    dims.stride_z = (NDimSpatial >= 3) ? conv_param.conv_filter_strides_[0] : 1;
-    dims.stride_y =
-        (NDimSpatial >= 2) ? conv_param.conv_filter_strides_[NDimSpatial >= 3 ? 1 : 0] : 1;
-    dims.stride_x = conv_param.conv_filter_strides_[NDimSpatial - 1];
-
-    dims.dilation_z = (NDimSpatial >= 3) ? conv_param.conv_filter_dilations_[0] : 1;
-    dims.dilation_y =
-        (NDimSpatial >= 2) ? conv_param.conv_filter_dilations_[NDimSpatial >= 3 ? 1 : 0] : 1;
-    dims.dilation_x = conv_param.conv_filter_dilations_[NDimSpatial - 1];
-
-    dims.pad_z = (NDimSpatial >= 3) ? conv_param.input_left_pads_[0] : 0;
-    dims.pad_y = (NDimSpatial >= 2) ? conv_param.input_left_pads_[NDimSpatial >= 3 ? 1 : 0] : 0;
-    dims.pad_x = conv_param.input_left_pads_[NDimSpatial - 1];
-
-    return dims;
-}
-
-} // namespace conv
-} // namespace utils
-} // namespace ck
-
-#endif
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp
index 686b1a0d34..aecf519c10 100644
--- a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp
@@ -4,146 +4,515 @@
 #pragma once
 
 #include "ck/utility/type_convert.hpp"
-#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck {
 namespace ref {
 
-/*
- * \brief naive implementation of 3D convolution backward data.
- * Layout is (NDHWC, KZYXC, NDHWK).
- * Computes gradient with respect to input.
- *
- * \param N number of batches
- * \param K number of filters (output channels)
- * \param C number of input channels
- * \param (Di, Hi, Wi) depth, height and width dimension of input
- * \param (Z, Y, X) depth, height and width dimensions of filter
- * \param (Do, Ho, Wo) depth, height and width dimension of output
- * \param (stride_z, stride_y, stride_x) strides
- * \param (dilation_z, dilation_y, dilation_x) dilations
- * \param (pad_z, pad_y, pad_x) pads
- */
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename TAcc,
-          typename InElementwiseOperation,
-          typename WeiElementwiseOperation,
-          typename OutElementwiseOperation>
-__global__ void naive_conv_bwd_data_ndhwc_kzyxc_ndhwk(TIn* __restrict__ p_in_grad,
-                                                      const TWei* __restrict__ p_wei,
-                                                      const TOut* __restrict__ p_out_grad,
-                                                      const ConvDims dims)
+// Optimized backward data convolution kernel working with packed (contiguous) tensors
+// Computes gradients w.r.t. input from output gradients and weights
+// Assumes row-major packing: input[G][N][C][spatial], weight[G][K][C][filter],
+// output[G][N][K][spatial]
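+// Example: with this packing, input-gradient element (g, n, c, hi, wi) of a 2D problem
+// lives at offset ((((g * N + n) * C + c) * Hi + hi) * Wi + wi), which is exactly what
+// the per-dimension strides computed inside the kernel encode.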
+template <index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp>
+__global__ void naive_conv_bwd_data_packed(InDataType* __restrict__ p_in,
+                                           const WeiDataType* __restrict__ p_wei,
+                                           const OutDataType* __restrict__ p_out,
+                                           index_t G,
+                                           index_t N,
+                                           index_t K,
+                                           index_t C,
+                                           index_t Di,
+                                           index_t Hi,
+                                           index_t Wi,
+                                           index_t Z,
+                                           index_t Y,
+                                           index_t X,
+                                           index_t Do,
+                                           index_t Ho,
+                                           index_t Wo,
+                                           index_t stride_z,
+                                           index_t stride_y,
+                                           index_t stride_x,
+                                           index_t dilation_z,
+                                           index_t dilation_y,
+                                           index_t dilation_x,
+                                           index_t pad_z,
+                                           index_t pad_y,
+                                           index_t pad_x,
+                                           InElementOp in_op,
+                                           WeiElementOp wei_op,
+                                           OutElementOp out_op)
 {
-    const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const index_t num_threads = blockDim.x * gridDim.x;
-    const long_index_t input_length = dims.N * dims.Di * dims.Hi * dims.Wi * dims.C;
+    const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    const long_index_t num_threads = blockDim.x * gridDim.x;
 
-    const index_t in_strides[] = {
-        dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C};
-    const index_t out_strides[] = {
-        dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K};
-    const index_t wei_strides[] = {
-        dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C};
+    InDataType in_val = InDataType{0};
+    WeiDataType wei_val = WeiDataType{0};
+    OutDataType out_val = OutDataType{0};
 
-    constexpr auto in_op = InElementwiseOperation{};
-    constexpr auto wei_op = WeiElementwiseOperation{};
-    constexpr auto out_op = OutElementwiseOperation{};
-
-    TIn in_val = TIn{0};
-    TWei wei_val = TWei{0};
-    TOut out_val = TOut{0};
-
-    for(long_index_t ii = tid; ii < input_length; ii += num_threads)
+    if constexpr(NDimSpatial == 1)
     {
-        // Decode linear index to (n, di, hi, wi, c)
-        const index_t n = ii / in_strides[0];
-        index_t tmp = ii - n * in_strides[0];
-        const index_t di = tmp / in_strides[1];
-        tmp -= di * in_strides[1];
-        const index_t hi = tmp / in_strides[2];
-        tmp -= hi * in_strides[2];
-        const index_t wi = tmp / in_strides[3];
-        tmp -= wi * in_strides[3];
-        const index_t c = tmp;
+        const long_index_t num_in = G * N * C * Wi;
+        const long_index_t out_stride_g = N * K * Wo;
+        const long_index_t out_stride_n = K * Wo;
+        const long_index_t out_stride_k = Wo;
+        const long_index_t wei_stride_g = K * C * X;
+        const long_index_t wei_stride_k = C * X;
+        const long_index_t wei_stride_c = X;
+        const long_index_t in_stride_g = N * C * Wi;
+        const long_index_t in_stride_n = C * Wi;
+        const long_index_t in_stride_c = Wi;
 
-        // Always accumulate in float
-        float acc_float = 0.0f;
-
-        const TOut* out_n = p_out_grad + static_cast<long_index_t>(n) * out_strides[0];
-
-        // Loop over output channels
-        for(index_t k = 0; k < dims.K; ++k)
+        for(long_index_t idx = tid; idx < num_in; idx += num_threads)
         {
-            const TWei* wei_k = p_wei + static_cast<long_index_t>(k) * wei_strides[0];
+            index_t remaining = idx;
+            const index_t wi = remaining % Wi;
+            remaining /= Wi;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t n = remaining % N;
+            const index_t g = remaining / N;
 
-            // Loop over filter dimensions
-            for(index_t z = 0; z < dims.Z; ++z)
+            float acc = 0.0f;
+            const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
+            const WeiDataType* wei_g = p_wei + g * wei_stride_g;
+
+            for(index_t x = 0; x < X; ++x)
             {
-                // Calculate output position from input position (inverse of forward)
-                index_t d_tmp = di + dims.pad_z - z * dims.dilation_z;
-                if(d_tmp % dims.stride_z != 0)
-                    continue;
-                index_t d_o = d_tmp / dims.stride_z;
-                if(d_o < 0 || d_o >= dims.Do)
-                    continue;
-
-                const TOut* out_n_do = out_n + d_o * out_strides[1];
-                const TWei* wei_k_z = wei_k + z * wei_strides[1];
-
-                for(index_t y = 0; y < dims.Y; ++y)
+                long_index_t w_tmp = wi + pad_x - x * dilation_x;
+                if(w_tmp % stride_x == 0)
                 {
-                    index_t h_tmp = hi + dims.pad_y - y * dims.dilation_y;
-                    if(h_tmp % dims.stride_y != 0)
-                        continue;
-                    index_t ho = h_tmp / dims.stride_y;
-                    if(ho < 0 || ho >= dims.Ho)
-                        continue;
-
-                    const TOut* out_n_do_ho = out_n_do + ho * out_strides[2];
-                    const TWei* wei_k_z_y = wei_k_z + y * wei_strides[2];
-
-                    for(index_t x = 0; x < dims.X; ++x)
+                    long_index_t wo = w_tmp / stride_x;
+                    if(wo >= 0 && wo < Wo)
                     {
-                        index_t w_tmp = wi + dims.pad_x - x * dims.dilation_x;
-                        if(w_tmp % dims.stride_x != 0)
-                            continue;
-                        index_t wo = w_tmp / dims.stride_x;
-                        if(wo < 0 || wo >= dims.Wo)
-                            continue;
+                        const OutDataType* out_gnk = out_gn;
+                        const WeiDataType* wei_gkc = wei_g + c * wei_stride_c;
 
-                        const TOut* out_n_do_ho_wo = out_n_do_ho + wo * out_strides[3];
-                        const TWei* wei_k_z_y_x = wei_k_z_y + x * wei_strides[3];
-
-                        // Load values from memory
-                        TOut out_loaded = out_n_do_ho_wo[k];
-                        TWei wei_loaded = wei_k_z_y_x[c];
-
-                        // Apply element-wise operations (like forward does)
-                        out_op(out_val, out_loaded);
-                        wei_op(wei_val, wei_loaded);
-
-                        // Convert to float for multiplication
-                        float out_f = type_convert<float>(out_val);
-                        float wei_f = type_convert<float>(wei_val);
-
-                        acc_float += out_f * wei_f;
+                        for(index_t k = 0; k < K; ++k)
+                        {
+                            out_op(out_val, out_gnk[k * out_stride_k + wo]);
+                            wei_op(wei_val, wei_gkc[k * wei_stride_k + x]);
+                            acc += type_convert<float>(out_val) * type_convert<float>(wei_val);
+                        }
                     }
                 }
             }
+
+            InDataType result = type_convert<InDataType>(acc);
+            in_op(in_val, result);
+            p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + wi] = in_val;
         }
+    }
+    else if constexpr(NDimSpatial == 2)
+    {
+        const long_index_t num_in = G * N * C * Hi * Wi;
+        const long_index_t out_stride_g = N * K * Ho * Wo;
+        const long_index_t out_stride_n = K * Ho * Wo;
+        const long_index_t out_stride_k = Ho * Wo;
+        const long_index_t out_stride_h = Wo;
+        const long_index_t wei_stride_g = K * C * Y * X;
+        const long_index_t wei_stride_k = C * Y * X;
+        const long_index_t wei_stride_c = Y * X;
+        const long_index_t wei_stride_y = X;
+        const long_index_t in_stride_g = N * C * Hi * Wi;
+        const long_index_t in_stride_n = C * Hi * Wi;
+        const long_index_t in_stride_c = Hi * Wi;
+        const long_index_t in_stride_h = Wi;
 
-        // Convert float accumulator to TAcc, then to input type
-        TAcc acc = type_convert<TAcc>(acc_float);
-        TIn result = type_convert<TIn>(acc);
+        for(long_index_t idx = tid; idx < num_in; idx += num_threads)
+        {
+            index_t remaining = idx;
+            const index_t wi = remaining % Wi;
+            remaining /= Wi;
+            const index_t hi = remaining % Hi;
+            remaining /= Hi;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t n = remaining % N;
+            const index_t g = remaining / N;
 
-        // Apply input element-wise operation (if any)
-        in_op(in_val, result);
+            float acc = 0.0f;
+            const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
+            const WeiDataType* wei_g = p_wei + g * wei_stride_g;
 
-        // Write transformed result
-        p_in_grad[ii] = in_val;
+            for(index_t y = 0; y < Y; ++y)
+            {
+                long_index_t h_tmp = hi + pad_y - y * dilation_y;
+                if(h_tmp % stride_y == 0)
+                {
+                    long_index_t ho = h_tmp / stride_y;
+                    if(ho >= 0 && ho < Ho)
+                    {
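+                        // Only output rows on the stride grid contribute to this input
+                        // element; accumulate over every k and every valid wo below.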
+                        const OutDataType* out_gnkh = out_gn + ho * out_stride_h;
+                        const WeiDataType* wei_gkcy = wei_g + c * wei_stride_c + y * wei_stride_y;
+
+                        for(index_t x = 0; x < X; ++x)
+                        {
+                            long_index_t w_tmp = wi + pad_x - x * dilation_x;
+                            if(w_tmp % stride_x == 0)
+                            {
+                                long_index_t wo = w_tmp / stride_x;
+                                if(wo >= 0 && wo < Wo)
+                                {
+                                    for(index_t k = 0; k < K; ++k)
+                                    {
+                                        out_op(out_val, out_gnkh[k * out_stride_k + wo]);
+                                        wei_op(wei_val, wei_gkcy[k * wei_stride_k + x]);
+                                        acc += type_convert<float>(out_val) *
+                                               type_convert<float>(wei_val);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            InDataType result = type_convert<InDataType>(acc);
+            in_op(in_val, result);
+            p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + hi * in_stride_h + wi] =
+                in_val;
+        }
+    }
+    else if constexpr(NDimSpatial == 3)
+    {
+        const long_index_t num_in = G * N * C * Di * Hi * Wi;
+        const long_index_t out_stride_g = N * K * Do * Ho * Wo;
+        const long_index_t out_stride_n = K * Do * Ho * Wo;
+        const long_index_t out_stride_k = Do * Ho * Wo;
+        const long_index_t out_stride_d = Ho * Wo;
+        const long_index_t out_stride_h = Wo;
+        const long_index_t wei_stride_g = K * C * Z * Y * X;
+        const long_index_t wei_stride_k = C * Z * Y * X;
+        const long_index_t wei_stride_c = Z * Y * X;
+        const long_index_t wei_stride_z = Y * X;
+        const long_index_t wei_stride_y = X;
+        const long_index_t in_stride_g = N * C * Di * Hi * Wi;
+        const long_index_t in_stride_n = C * Di * Hi * Wi;
+        const long_index_t in_stride_c = Di * Hi * Wi;
+        const long_index_t in_stride_d = Hi * Wi;
+        const long_index_t in_stride_h = Wi;
+
+        for(long_index_t idx = tid; idx < num_in; idx += num_threads)
+        {
+            index_t remaining = idx;
+            const index_t wi = remaining % Wi;
+            remaining /= Wi;
+            const index_t hi = remaining % Hi;
+            remaining /= Hi;
+            const index_t di = remaining % Di;
+            remaining /= Di;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t n = remaining % N;
+            const index_t g = remaining / N;
+
+            float acc = 0.0f;
+            const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
+            const WeiDataType* wei_g = p_wei + g * wei_stride_g;
+
+            for(index_t z = 0; z < Z; ++z)
+            {
+                long_index_t d_tmp = di + pad_z - z * dilation_z;
+                if(d_tmp % stride_z == 0)
+                {
+                    long_index_t do_idx = d_tmp / stride_z;
+                    if(do_idx >= 0 && do_idx < Do)
+                    {
+                        const OutDataType* out_gnkd = out_gn + do_idx * out_stride_d;
+                        const WeiDataType* wei_gkcz = wei_g + c * wei_stride_c + z * wei_stride_z;
+
+                        for(index_t y = 0; y < Y; ++y)
+                        {
+                            long_index_t h_tmp = hi + pad_y - y * dilation_y;
+                            if(h_tmp % stride_y == 0)
+                            {
+                                long_index_t ho = h_tmp / stride_y;
+                                if(ho >= 0 && ho < Ho)
+                                {
+                                    const OutDataType* out_gnkdh = out_gnkd + ho * out_stride_h;
+                                    const WeiDataType* wei_gkczy = wei_gkcz + y * wei_stride_y;
+
+                                    for(index_t x = 0; x < X; ++x)
+                                    {
+                                        long_index_t w_tmp = wi + pad_x - x * dilation_x;
+                                        if(w_tmp % stride_x == 0)
+                                        {
+                                            long_index_t wo = w_tmp / stride_x;
+                                            if(wo >= 0 && wo < Wo)
+                                            {
+                                                for(index_t k = 0; k < K; ++k)
+                                                {
+                                                    out_op(out_val,
+                                                           out_gnkdh[k * out_stride_k + wo]);
+                                                    wei_op(wei_val,
+                                                           wei_gkczy[k * wei_stride_k + x]);
+                                                    acc += type_convert<float>(out_val) *
+                                                           type_convert<float>(wei_val);
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            InDataType result = type_convert<InDataType>(acc);
+            in_op(in_val, result);
+            p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + di * in_stride_d +
+                 hi * in_stride_h + wi] = in_val;
+        }
+    }
+}
+
+// GPU reference backward data convolution - takes ConvParam directly
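+// The wrapper follows a pack -> compute -> unpack scheme: gather the caller's (possibly
+// strided) tensors into contiguous buffers, run the packed kernel above, then scatter the
+// computed input gradient back out through the caller's strides.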
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InElementwiseOperation  = tensor_operation::element_wise::PassThrough,
+          typename WeiElementwiseOperation = tensor_operation::element_wise::PassThrough,
+          typename OutElementwiseOperation = tensor_operation::element_wise::PassThrough>
+void naive_conv_bwd_data(TIn* p_in,
+                         const TWei* p_wei,
+                         const TOut* p_out,
+                         const ck::utils::conv::ConvParam& conv_param,
+                         InElementwiseOperation in_element_op   = InElementwiseOperation{},
+                         WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{},
+                         OutElementwiseOperation out_element_op = OutElementwiseOperation{},
+                         hipStream_t stream = nullptr)
+{
+    const auto ndim = conv_param.num_dim_spatial_;
+
+    const index_t G = conv_param.G_;
+    const index_t N = conv_param.N_;
+    const index_t C = conv_param.C_;
+    const index_t K = conv_param.K_;
+
+    std::vector<index_t> in_lengths  = {G, N, C};
+    std::vector<index_t> wei_lengths = {G, K, C};
+    std::vector<index_t> out_lengths = {G, N, K};
+
+    for(index_t i = 0; i < ndim; ++i)
+    {
+        in_lengths.push_back(static_cast<index_t>(conv_param.input_spatial_lengths_[i]));
+        wei_lengths.push_back(static_cast<index_t>(conv_param.filter_spatial_lengths_[i]));
+        out_lengths.push_back(static_cast<index_t>(conv_param.output_spatial_lengths_[i]));
+    }
+
+    // Calculate total elements for buffer allocation
+    long_index_t in_total = 1, wei_total = 1, out_total = 1;
+    for(auto l : in_lengths)
+        in_total *= l;
+    for(auto l : wei_lengths)
+        wei_total *= l;
+    for(auto l : out_lengths)
+        out_total *= l;
+
+    // Allocate packed buffers
+    SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn));
+    SimpleDeviceMem wei_packed_buf(wei_total * sizeof(TWei));
+    SimpleDeviceMem out_packed_buf(out_total * sizeof(TOut));
+
+    TIn* p_in_packed   = static_cast<TIn*>(in_packed_buf.GetDeviceBuffer());
+    TWei* p_wei_packed = static_cast<TWei*>(wei_packed_buf.GetDeviceBuffer());
+    TOut* p_out_packed = static_cast<TOut*>(out_packed_buf.GetDeviceBuffer());
+
+    // Compute strides and allocate device arrays for pack/unpack
+    std::vector<index_t> in_strides  = compute_conv_tensor_strides<InLayout>(in_lengths, ndim);
+    std::vector<index_t> wei_strides = compute_conv_tensor_strides<WeiLayout>(wei_lengths, ndim);
+    std::vector<index_t> out_strides = compute_conv_tensor_strides<OutLayout>(out_lengths, ndim);
+
+    const size_t dim_count = in_lengths.size();
+    SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t));
+
+    index_t* d_in_lengths  = static_cast<index_t*>(in_lengths_buf.GetDeviceBuffer());
+    index_t* d_in_strides  = static_cast<index_t*>(in_strides_buf.GetDeviceBuffer());
+    index_t* d_wei_lengths = static_cast<index_t*>(wei_lengths_buf.GetDeviceBuffer());
+    index_t* d_wei_strides = static_cast<index_t*>(wei_strides_buf.GetDeviceBuffer());
+    index_t* d_out_lengths = static_cast<index_t*>(out_lengths_buf.GetDeviceBuffer());
+    index_t* d_out_strides = static_cast<index_t*>(out_strides_buf.GetDeviceBuffer());
+
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+
+    // Pack output and weight tensors to contiguous layout (inputs to bwd data)
+    constexpr int block_size = 256;
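+    // Packing lets the compute kernel assume contiguous row-major tensors regardless of
+    // the strides implied by the caller's layout.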
+    strided_copy_kernel
+        <<<(out_total + block_size - 1) / block_size, block_size, 0, stream>>>(
+            p_out, p_out_packed, d_out_lengths, d_out_strides, dim_count, out_total);
+    strided_copy_kernel
+        <<<(wei_total + block_size - 1) / block_size, block_size, 0, stream>>>(
+            p_wei, p_wei_packed, d_wei_lengths, d_wei_strides, dim_count, wei_total);
+
+    // Build conv parameter vectors for kernel invocation
+    std::vector<index_t> conv_strides(ndim);
+    std::vector<index_t> conv_dilations(ndim);
+    std::vector<index_t> input_pads(ndim);
+    for(index_t i = 0; i < ndim; ++i)
+    {
+        conv_strides[i]   = static_cast<index_t>(conv_param.conv_filter_strides_[i]);
+        conv_dilations[i] = static_cast<index_t>(conv_param.conv_filter_dilations_[i]);
+        input_pads[i]     = static_cast<index_t>(conv_param.input_left_pads_[i]);
+    }
+
+    // Run backward data convolution kernel on packed data
+    const int in_grid = (in_total + block_size - 1) / block_size;
+
+    if(ndim == 1)
+    {
+        naive_conv_bwd_data_packed<1,
+                                   TIn,
+                                   TWei,
+                                   TOut,
+                                   InElementwiseOperation,
+                                   WeiElementwiseOperation,
+                                   OutElementwiseOperation>
+            <<<in_grid, block_size, 0, stream>>>(p_in_packed,
+                                                 p_wei_packed,
+                                                 p_out_packed,
+                                                 G, N, K, C,
+                                                 1, 1, in_lengths[3],
+                                                 1, 1, wei_lengths[3],
+                                                 1, 1, out_lengths[3],
+                                                 1, 1, conv_strides[0],
+                                                 1, 1, conv_dilations[0],
+                                                 0, 0, input_pads[0],
+                                                 in_element_op,
+                                                 wei_element_op,
+                                                 out_element_op);
+    }
+    else if(ndim == 2)
+    {
+        naive_conv_bwd_data_packed<2,
+                                   TIn,
+                                   TWei,
+                                   TOut,
+                                   InElementwiseOperation,
+                                   WeiElementwiseOperation,
+                                   OutElementwiseOperation>
+            <<<in_grid, block_size, 0, stream>>>(p_in_packed,
+                                                 p_wei_packed,
+                                                 p_out_packed,
+                                                 G, N, K, C,
+                                                 1, in_lengths[3], in_lengths[4],
+                                                 1, wei_lengths[3], wei_lengths[4],
+                                                 1, out_lengths[3], out_lengths[4],
+                                                 1, conv_strides[0], conv_strides[1],
+                                                 1, conv_dilations[0], conv_dilations[1],
+                                                 0, input_pads[0], input_pads[1],
+                                                 in_element_op,
+                                                 wei_element_op,
+                                                 out_element_op);
+    }
+    else // 3D
+    {
+        naive_conv_bwd_data_packed<3,
+                                   TIn,
+                                   TWei,
+                                   TOut,
+                                   InElementwiseOperation,
+                                   WeiElementwiseOperation,
+                                   OutElementwiseOperation>
+            <<<in_grid, block_size, 0, stream>>>(p_in_packed,
+                                                 p_wei_packed,
+                                                 p_out_packed,
+                                                 G, N, K, C,
+                                                 in_lengths[3], in_lengths[4], in_lengths[5],
+                                                 wei_lengths[3], wei_lengths[4], wei_lengths[5],
+                                                 out_lengths[3], out_lengths[4], out_lengths[5],
+                                                 conv_strides[0], conv_strides[1], conv_strides[2],
+                                                 conv_dilations[0], conv_dilations[1], conv_dilations[2],
+                                                 input_pads[0], input_pads[1], input_pads[2],
+                                                 in_element_op,
+                                                 wei_element_op,
+                                                 out_element_op);
+    }
+
+    // Unpack result back to strided layout
+    strided_copy_kernel<<<in_grid, block_size, 0, stream>>>(
+        p_in_packed, p_in, d_in_lengths, d_in_strides, dim_count, in_total);
+
+    HIP_CHECK_ERROR(hipGetLastError());
+
+    // Memory automatically freed by SimpleDeviceMem destructors
+}
+
 } // namespace ref
 } // namespace ck
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp
index ff44c9fcbd..f46b072baa 100644
--- a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp
@@ -4,136 +4,497 @@
 #pragma once
 
 #include "ck/utility/type_convert.hpp"
-#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
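+// Reuses the packing helpers from naive_conv_utils.hpp (SimpleDeviceMem,
+// strided_copy_kernel, compute_conv_tensor_strides) that the bwd-data reference uses.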
"ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { namespace ref { -/* - * \brief naive implementation of 3D convolution backward weight. - * Layout is (NDHWC, KZYXC, NDHWK). - * Computes gradient with respect to weights. - * - * \param N number of batches - * \param K number of filters (output channels) - * \param C number of input channels - * \param (Di, Hi, Wi) depth, height and width dimension of input - * \param (Z, Y, X) depth, height and width dimensions of filter - * \param (Do, Ho, Wo) depth, height and width dimension of output - * \param (stride_z, stride_y, stride_x) strides - * \param (dilation_z, dilation_y, dilation_x) dilations - * \param (pad_z, pad_y, pad_x) pads - */ -template -__global__ void naive_conv_bwd_weight_ndhwc_kzyxc_ndhwk(const TIn* __restrict__ p_in, - TWei* __restrict__ p_wei_grad, - const TOut* __restrict__ p_out_grad, - const ConvDims dims) +// Optimized backward weight convolution kernel working with packed (contiguous) tensors +// Assumes row-major packing: input[G][N][C][spatial], output_grad[G][N][K][spatial], +// weight_grad[G][K][C][filter] +// Computes gradient with respect to weights +template +__global__ void naive_conv_bwd_weight_packed(const InDataType* __restrict__ p_in, + WeiDataType* __restrict__ p_wei_grad, + const OutDataType* __restrict__ p_out_grad, + index_t G, + index_t N, + index_t K, + index_t C, + index_t Di, + index_t Hi, + index_t Wi, + index_t Z, + index_t Y, + index_t X, + index_t Do, + index_t Ho, + index_t Wo, + index_t stride_z, + index_t stride_y, + index_t stride_x, + index_t dilation_z, + index_t dilation_y, + index_t dilation_x, + index_t pad_z, + index_t pad_y, + index_t pad_x, + InElementOp in_op, + WeiElementOp wei_op, + OutElementOp out_op) { - const index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - const index_t num_threads = blockDim.x * gridDim.x; - const long_index_t weight_length = dims.K * dims.Z * dims.Y * dims.X * dims.C; + const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const long_index_t num_threads = blockDim.x * gridDim.x; - const index_t in_strides[] = { - dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C}; - const index_t out_strides[] = { - dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K}; - const index_t wei_strides[] = { - dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C}; + InDataType in_val = InDataType{0}; + WeiDataType wei_val = WeiDataType{0}; + OutDataType out_val = OutDataType{0}; - constexpr auto in_op = InElementwiseOperation{}; - constexpr auto wei_op = WeiElementwiseOperation{}; - constexpr auto out_op = OutElementwiseOperation{}; - - TIn in_val = TIn{0}; - TWei wei_val = TWei{0}; - TOut out_val = TOut{0}; - - for(long_index_t ii = tid; ii < weight_length; ii += num_threads) + if constexpr(NDimSpatial == 1) { - // Decode linear index to (k, z, y, x, c) - const index_t k = ii / wei_strides[0]; - index_t tmp = ii - k * wei_strides[0]; - const index_t z = tmp / wei_strides[1]; - tmp -= z * wei_strides[1]; - const index_t y = tmp / wei_strides[2]; - tmp -= y * wei_strides[2]; - const index_t x = tmp / wei_strides[3]; - tmp -= x * wei_strides[3]; - const index_t c = tmp; + const long_index_t num_wei = G * K * C * X; + const long_index_t in_stride_g = N * C * Wi; + const long_index_t in_stride_n = 
 
-        // Always accumulate in float
-        float acc_float = 0.0f;
-
-        // Loop over batch
-        for(index_t n = 0; n < dims.N; ++n)
+        for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
         {
-            const TIn* in_n = p_in + static_cast<long_index_t>(n) * in_strides[0];
-            const TOut* out_n = p_out_grad + static_cast<long_index_t>(n) * out_strides[0];
+            index_t remaining = idx;
+            const index_t x = remaining % X;
+            remaining /= X;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t k = remaining % K;
+            const index_t g = remaining / K;
 
-            // Loop over output spatial dimensions
-            for(index_t d_o = 0; d_o < dims.Do; ++d_o)
+            float acc = 0.0f;
+            const InDataType* in_g = p_in + g * in_stride_g;
+            const OutDataType* out_grad = p_out_grad + g * out_stride_g;
+
+            // Loop over batch and output positions
+            for(index_t n = 0; n < N; ++n)
             {
-                // Calculate input position from output position
-                index_t di = d_o * dims.stride_z - dims.pad_z + z * dims.dilation_z;
-                if(di < 0 || di >= dims.Di)
-                    continue;
+                const InDataType* in_gn = in_g + n * in_stride_n + c * in_stride_c;
+                const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
 
-                const TIn* in_n_di = in_n + di * in_strides[1];
-                const TOut* out_n_do = out_n + d_o * out_strides[1];
-
-                for(index_t ho = 0; ho < dims.Ho; ++ho)
+                for(index_t wo = 0; wo < Wo; ++wo)
                 {
-                    index_t hi = ho * dims.stride_y - dims.pad_y + y * dims.dilation_y;
-                    if(hi < 0 || hi >= dims.Hi)
-                        continue;
-
-                    const TIn* in_n_di_hi = in_n_di + hi * in_strides[2];
-                    const TOut* out_n_do_ho = out_n_do + ho * out_strides[2];
-
-                    for(index_t wo = 0; wo < dims.Wo; ++wo)
+                    long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
+                    if(wi >= 0 && wi < Wi)
                     {
-                        index_t wi = wo * dims.stride_x - dims.pad_x + x * dims.dilation_x;
-                        if(wi < 0 || wi >= dims.Wi)
-                            continue;
-
-                        // Load values from memory (like forward does)
-                        const TIn* in_ptr = in_n_di_hi + wi * in_strides[3];
-                        const TOut* out_ptr = out_n_do_ho + wo * out_strides[3];
-
-                        TIn in_loaded = in_ptr[c];
-                        TOut out_loaded = out_ptr[k];
-
-                        // Apply element-wise operations
-                        in_op(in_val, in_loaded);
-                        out_op(out_val, out_loaded);
-
-                        // Convert to float for multiplication
-                        float in_f = type_convert<float>(in_val);
-                        float out_f = type_convert<float>(out_val);
-
-                        acc_float += out_f * in_f;
+                        in_op(in_val, in_gn[wi]);
+                        out_op(out_val, out_gn_k[wo]);
+                        acc += type_convert<float>(out_val) * type_convert<float>(in_val);
                     }
                 }
            }
+
+            WeiDataType result = type_convert<WeiDataType>(acc);
+            wei_op(wei_val, result);
+            p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + x] = wei_val;
         }
+    }
+    else if constexpr(NDimSpatial == 2)
+    {
+        const long_index_t num_wei = G * K * C * Y * X;
+        const long_index_t in_stride_g = N * C * Hi * Wi;
+        const long_index_t in_stride_n = C * Hi * Wi;
+        const long_index_t in_stride_c = Hi * Wi;
+        const long_index_t in_stride_h = Wi;
+        const long_index_t out_stride_g = N * K * Ho * Wo;
+        const long_index_t out_stride_n = K * Ho * Wo;
+        const long_index_t out_stride_k = Ho * Wo;
+        const long_index_t out_stride_h = Wo;
+        const long_index_t wei_stride_g = K * C * Y * X;
+        const long_index_t wei_stride_k = C * Y * X;
+        const long_index_t wei_stride_c = Y * X;
+        const long_index_t wei_stride_y = X;
 
-        // Convert float accumulator to TAcc, then to weight type
-        TAcc acc = type_convert<TAcc>(acc_float);
-        TWei result = type_convert<TWei>(acc);
+        for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
+        {
+            index_t remaining = idx;
+            const index_t x = remaining % X;
+            remaining /= X;
+            const index_t y = remaining % Y;
+            remaining /= Y;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t k = remaining % K;
+            const index_t g = remaining / K;
 
-        // Apply weight element-wise operation (if any)
-        wei_op(wei_val, result);
+            float acc = 0.0f;
+            const InDataType* in_g = p_in + g * in_stride_g;
+            const OutDataType* out_grad = p_out_grad + g * out_stride_g;
 
-        // Write transformed result
-        p_wei_grad[ii] = wei_val;
+            // Loop over batch and output positions
+            for(index_t n = 0; n < N; ++n)
+            {
+                const InDataType* in_gnc = in_g + n * in_stride_n + c * in_stride_c;
+                const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
+
+                for(index_t ho = 0; ho < Ho; ++ho)
+                {
+                    long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
+                    if(hi >= 0 && hi < Hi)
+                    {
+                        const InDataType* in_gnch = in_gnc + hi * in_stride_h;
+                        const OutDataType* out_gn_kh = out_gn_k + ho * out_stride_h;
+
+                        for(index_t wo = 0; wo < Wo; ++wo)
+                        {
+                            long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
+                            if(wi >= 0 && wi < Wi)
+                            {
+                                in_op(in_val, in_gnch[wi]);
+                                out_op(out_val, out_gn_kh[wo]);
+                                acc += type_convert<float>(out_val) * type_convert<float>(in_val);
+                            }
+                        }
+                    }
+                }
+            }
+
+            WeiDataType result = type_convert<WeiDataType>(acc);
+            wei_op(wei_val, result);
+            p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + y * wei_stride_y +
+                       x] = wei_val;
+        }
     }
+    else if constexpr(NDimSpatial == 3)
+    {
+        const long_index_t num_wei = G * K * C * Z * Y * X;
+        const long_index_t in_stride_g = N * C * Di * Hi * Wi;
+        const long_index_t in_stride_n = C * Di * Hi * Wi;
+        const long_index_t in_stride_c = Di * Hi * Wi;
+        const long_index_t in_stride_d = Hi * Wi;
+        const long_index_t in_stride_h = Wi;
+        const long_index_t out_stride_g = N * K * Do * Ho * Wo;
+        const long_index_t out_stride_n = K * Do * Ho * Wo;
+        const long_index_t out_stride_k = Do * Ho * Wo;
+        const long_index_t out_stride_d = Ho * Wo;
+        const long_index_t out_stride_h = Wo;
+        const long_index_t wei_stride_g = K * C * Z * Y * X;
+        const long_index_t wei_stride_k = C * Z * Y * X;
+        const long_index_t wei_stride_c = Z * Y * X;
+        const long_index_t wei_stride_z = Y * X;
+        const long_index_t wei_stride_y = X;
+
+        for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
+        {
+            index_t remaining = idx;
+            const index_t x = remaining % X;
+            remaining /= X;
+            const index_t y = remaining % Y;
+            remaining /= Y;
+            const index_t z = remaining % Z;
+            remaining /= Z;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t k = remaining % K;
+            const index_t g = remaining / K;
+
+            float acc = 0.0f;
+            const InDataType* in_g = p_in + g * in_stride_g;
+            const OutDataType* out_grad = p_out_grad + g * out_stride_g;
+
+            // Loop over batch and output positions
+            for(index_t n = 0; n < N; ++n)
+            {
+                const InDataType* in_gnc = in_g + n * in_stride_n + c * in_stride_c;
+                const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
+
+                for(index_t do_idx = 0; do_idx < Do; ++do_idx)
+                {
+                    long_index_t di = do_idx * stride_z + z * dilation_z - pad_z;
+                    if(di >= 0 && di < Di)
+                    {
+                        const InDataType* in_gncd = in_gnc + di * in_stride_d;
+                        const OutDataType* out_gn_kd = out_gn_k + do_idx * out_stride_d;
+
+                        for(index_t ho = 0; ho < Ho; ++ho)
+                        {
+                            long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
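+                            // Rows whose receptive field falls into padding contribute nothing.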
if(hi >= 0 && hi < Hi)
+                            {
+                                const InDataType* in_gncdh   = in_gncd + hi * in_stride_h;
+                                const OutDataType* out_gn_kdh = out_gn_kd + ho * out_stride_h;
+
+                                for(index_t wo = 0; wo < Wo; ++wo)
+                                {
+                                    long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
+                                    if(wi >= 0 && wi < Wi)
+                                    {
+                                        in_op(in_val, in_gncdh[wi]);
+                                        out_op(out_val, out_gn_kdh[wo]);
+                                        acc += type_convert<float>(out_val) *
+                                               type_convert<float>(in_val);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            WeiDataType result = type_convert<WeiDataType>(acc);
+            wei_op(wei_val, result);
+            p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + z * wei_stride_z +
+                       y * wei_stride_y + x] = wei_val;
+        }
+    }
+}
+
+// GPU reference backward weight convolution - takes ConvParam directly
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InElementwiseOperation  = ck::tensor_operation::element_wise::PassThrough,
+          typename WeiElementwiseOperation = ck::tensor_operation::element_wise::PassThrough,
+          typename OutElementwiseOperation = ck::tensor_operation::element_wise::PassThrough>
+void naive_conv_bwd_weight(const TIn* p_in,
+                           TWei* p_wei_grad,
+                           const TOut* p_out,
+                           const ck::utils::conv::ConvParam& conv_param,
+                           InElementwiseOperation in_element_op   = InElementwiseOperation{},
+                           WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{},
+                           OutElementwiseOperation out_element_op = OutElementwiseOperation{},
+                           hipStream_t stream = nullptr)
+{
+    const auto ndim = conv_param.num_dim_spatial_;
+
+    const index_t G = conv_param.G_;
+    const index_t N = conv_param.N_;
+    const index_t C = conv_param.C_;
+    const index_t K = conv_param.K_;
+
+    std::vector<index_t> in_lengths  = {G, N, C};
+    std::vector<index_t> wei_lengths = {G, K, C};
+    std::vector<index_t> out_lengths = {G, N, K};
+
+    for(index_t i = 0; i < ndim; ++i)
+    {
+        in_lengths.push_back(static_cast<index_t>(conv_param.input_spatial_lengths_[i]));
+        wei_lengths.push_back(static_cast<index_t>(conv_param.filter_spatial_lengths_[i]));
+        out_lengths.push_back(static_cast<index_t>(conv_param.output_spatial_lengths_[i]));
+    }
+
+    // Calculate total elements for buffer allocation
+    long_index_t in_total = 1, wei_total = 1, out_total = 1;
+    for(auto l : in_lengths)
+        in_total *= l;
+    for(auto l : wei_lengths)
+        wei_total *= l;
+    for(auto l : out_lengths)
+        out_total *= l;
+
+    // Allocate packed buffers
+    SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn));
+    SimpleDeviceMem wei_grad_packed_buf(wei_total * sizeof(TWei));
+    SimpleDeviceMem out_grad_packed_buf(out_total * sizeof(TOut));
+
+    TIn* p_in_packed        = static_cast<TIn*>(in_packed_buf.GetDeviceBuffer());
+    TWei* p_wei_grad_packed = static_cast<TWei*>(wei_grad_packed_buf.GetDeviceBuffer());
+    TOut* p_out_grad_packed = static_cast<TOut*>(out_grad_packed_buf.GetDeviceBuffer());
+
+    // Compute strides and allocate device arrays for pack/unpack
+    std::vector<index_t> in_strides  = compute_conv_tensor_strides<InLayout>(in_lengths, ndim);
+    std::vector<index_t> wei_strides = compute_conv_tensor_strides<WeiLayout>(wei_lengths, ndim);
+    std::vector<index_t> out_strides = compute_conv_tensor_strides<OutLayout>(out_lengths, ndim);
+
+    const size_t dim_count = in_lengths.size();
+    SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t));
+
+    index_t* d_in_lengths  = static_cast<index_t*>(in_lengths_buf.GetDeviceBuffer());
+    index_t* d_in_strides  = static_cast<index_t*>(in_strides_buf.GetDeviceBuffer());
+    index_t* d_wei_lengths = static_cast<index_t*>(wei_lengths_buf.GetDeviceBuffer());
+    index_t* d_wei_strides = static_cast<index_t*>(wei_strides_buf.GetDeviceBuffer());
+    index_t* d_out_lengths = static_cast<index_t*>(out_lengths_buf.GetDeviceBuffer());
+    index_t* d_out_strides = static_cast<index_t*>(out_strides_buf.GetDeviceBuffer());
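+
+    // The pack/unpack kernels dereference these length/stride arrays on the GPU,
+    // so the host-side vectors must be mirrored into device memory before launch.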
HIP_CHECK_ERROR(hipMemcpy( + d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + + // Pack input and output_grad tensors to contiguous layout (inputs to bwd weight) + constexpr int block_size = 256; + strided_copy_kernel + <<<(in_total + block_size - 1) / block_size, block_size, 0, stream>>>( + p_in, p_in_packed, d_in_lengths, d_in_strides, dim_count, in_total); + strided_copy_kernel + <<<(out_total + block_size - 1) / block_size, block_size, 0, stream>>>( + p_out, p_out_grad_packed, d_out_lengths, d_out_strides, dim_count, out_total); + + // Build conv parameter vectors for kernel invocation + std::vector conv_strides(ndim); + std::vector conv_dilations(ndim); + std::vector input_pads(ndim); + for(index_t i = 0; i < ndim; ++i) + { + conv_strides[i] = static_cast(conv_param.conv_filter_strides_[i]); + conv_dilations[i] = static_cast(conv_param.conv_filter_dilations_[i]); + input_pads[i] = static_cast(conv_param.input_left_pads_[i]); + } + + // Run backward weight convolution kernel on packed data + const int wei_grid = (wei_total + block_size - 1) / block_size; + + if(ndim == 1) + { + naive_conv_bwd_weight_packed<1, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_grad_packed, + p_out_grad_packed, + G, + N, + K, + C, + 1, + 1, + in_lengths[3], + 1, + 1, + wei_lengths[3], + 1, + 1, + out_lengths[3], + 1, + 1, + conv_strides[0], + 1, + 1, + conv_dilations[0], + 0, + 0, + input_pads[0], + in_element_op, + wei_element_op, + out_element_op); + } + else if(ndim == 2) + { + naive_conv_bwd_weight_packed<2, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_grad_packed, + p_out_grad_packed, + G, + N, + K, + C, + 1, + in_lengths[3], + in_lengths[4], + 1, + wei_lengths[3], + wei_lengths[4], + 1, + out_lengths[3], + out_lengths[4], + 1, + conv_strides[0], + conv_strides[1], + 1, + conv_dilations[0], + conv_dilations[1], + 0, + input_pads[0], + input_pads[1], + in_element_op, + wei_element_op, + out_element_op); + } + else // 3D + { + naive_conv_bwd_weight_packed<3, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_grad_packed, + p_out_grad_packed, + G, + N, + K, + C, + in_lengths[3], + in_lengths[4], + in_lengths[5], + wei_lengths[3], + wei_lengths[4], + wei_lengths[5], + out_lengths[3], + out_lengths[4], + out_lengths[5], + conv_strides[0], + conv_strides[1], + conv_strides[2], + conv_dilations[0], + conv_dilations[1], + conv_dilations[2], + input_pads[0], + input_pads[1], + input_pads[2], + in_element_op, + wei_element_op, + out_element_op); + } + + // Unpack weight gradient + strided_copy_kernel<<>>( + p_wei_grad_packed, p_wei_grad, d_wei_lengths, d_wei_strides, dim_count, wei_total); + + HIP_CHECK_ERROR(hipGetLastError()); + + // Memory 
automatically freed by SimpleDeviceMem destructors +} + } // namespace ref } // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp index defbbd5be4..131b632a25 100644 --- a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp +++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp @@ -4,126 +4,493 @@ #pragma once #include "ck/utility/type_convert.hpp" -#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp" +#include "ck/host_utility/hip_check_error.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { namespace ref { -/* - * \brief naive implementation of 3D convolution. Layout is (NDHWC, KZYXC, NDHWK). - * - * \param N number of batches - * \param K number of filters - * \param C number of channels of weight - * \param (Di, Hi, Wi) depth, height and width dimension of data - * \param (Z, Y, X) depth, height and width dimensions of weights - * \param (Do, Ho, Wo) depth, height and width dimension of output - * \param (stride_z, stride_y, stride_x) strides - * \param (dilation_z, dilation_y, dilation_x) dilations - * \param (pad_z, pad_y, pad_x) pads - */ -template -__global__ void naive_conv_fwd_ndhwc_kzyxc_ndhwk(const TIn* __restrict__ p_in, - const TWei* __restrict__ p_wei, - TOut* __restrict__ p_out, - const ConvDims dims) +// Optimized convolution kernel working with packed (contiguous) tensors +// Assumes row-major packing: input[G][N][C][spatial], weight[G][K][C][filter], +// output[G][N][K][spatial] +template +__global__ void naive_conv_fwd_packed(const InDataType* __restrict__ p_in, + const WeiDataType* __restrict__ p_wei, + OutDataType* __restrict__ p_out, + index_t G, + index_t N, + index_t K, + index_t C, + index_t Di, + index_t Hi, + index_t Wi, + index_t Z, + index_t Y, + index_t X, + index_t Do, + index_t Ho, + index_t Wo, + index_t stride_z, + index_t stride_y, + index_t stride_x, + index_t dilation_z, + index_t dilation_y, + index_t dilation_x, + index_t pad_z, + index_t pad_y, + index_t pad_x, + InElementOp in_op, + WeiElementOp wei_op, + OutElementOp out_op) { - const index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - const index_t num_threads = blockDim.x * gridDim.x; - const long_index_t output_length = dims.N * dims.Do * dims.Ho * dims.Wo * dims.K; + const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const long_index_t num_threads = blockDim.x * gridDim.x; - const index_t out_strides[] = { - dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K}; - const index_t in_strides[] = { - dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C}; - const index_t wei_strides[] = { - dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C}; + InDataType in_val = InDataType{0}; + WeiDataType wei_val = WeiDataType{0}; + OutDataType out_val = OutDataType{0}; - constexpr auto in_op = InElementwiseOperation{}; - constexpr auto wei_op = WeiElementwiseOperation{}; - constexpr auto out_op = OutElementwiseOperation{}; - - TIn in_val = TIn{0}; - TWei wei_val = TWei{0}; - TOut out_val = 
TOut{0}; - - for(long_index_t ii = tid; ii < output_length; ii += num_threads) + if constexpr(NDimSpatial == 1) { - const index_t n = ii / out_strides[0]; - index_t k = ii - n * out_strides[0]; - const index_t dO = k / out_strides[1]; - k -= dO * out_strides[1]; - const index_t ho = k / out_strides[2]; - k -= ho * out_strides[2]; - const index_t wo = k / out_strides[3]; - k -= wo * out_strides[3]; + const long_index_t num_out = G * N * K * Wo; + const long_index_t in_stride_g = N * C * Wi; + const long_index_t in_stride_n = C * Wi; + const long_index_t in_stride_c = Wi; + const long_index_t wei_stride_g = K * C * X; + const long_index_t wei_stride_k = C * X; + const long_index_t wei_stride_c = X; + const long_index_t out_stride_g = N * K * Wo; + const long_index_t out_stride_n = K * Wo; + const long_index_t out_stride_k = Wo; - // Always accumulate in float (FP8/BF8 don't support arithmetic) - float acc_float = 0.0f; - - const TIn* in_n = p_in + static_cast(n) * in_strides[0]; - const TWei* wei_k = p_wei + static_cast(k) * wei_strides[0]; - - for(index_t z = 0; z < dims.Z; ++z) + for(long_index_t idx = tid; idx < num_out; idx += num_threads) { - index_t di = dims.stride_z * dO - dims.pad_z + dims.dilation_z * z; - const TIn* in_n_di = in_n + di * in_strides[1]; - const TWei* wei_k_z = wei_k + z * wei_strides[1]; + index_t remaining = idx; + const index_t wo = remaining % Wo; + remaining /= Wo; + const index_t k = remaining % K; + remaining /= K; + const index_t n = remaining % N; + const index_t g = remaining / N; - for(index_t y = 0; y < dims.Y; ++y) + float acc = 0.0f; + const InDataType* in_g = p_in + g * in_stride_g + n * in_stride_n; + const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k; + + for(index_t c = 0; c < C; ++c) { - index_t hi = dims.stride_y * ho - dims.pad_y + dims.dilation_y * y; - const TIn* in_n_di_hi = in_n_di + hi * in_strides[2]; - const TWei* wei_k_z_y = wei_k_z + y * wei_strides[2]; + const InDataType* in_gc = in_g + c * in_stride_c; + const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c; - for(index_t x = 0; x < dims.X; ++x) + for(index_t x = 0; x < X; ++x) { - index_t wi = dims.stride_x * wo - dims.pad_x + dims.dilation_x * x; - const TIn* in_n_di_hi_wi = in_n_di_hi + wi * in_strides[3]; - const TWei* wei_k_z_y_x = wei_k_z_y + x * wei_strides[3]; - - if(di >= 0 && di < dims.Di && hi >= 0 && hi < dims.Hi && wi >= 0 && - wi < dims.Wi) + long_index_t wi = wo * stride_x + x * dilation_x - pad_x; + if(wi >= 0 && wi < Wi) { - for(index_t c = 0; c < dims.C; ++c) + in_op(in_val, in_gc[wi]); + wei_op(wei_val, wei_gkc[x]); + acc += type_convert(in_val) * type_convert(wei_val); + } + } + } + + OutDataType result = type_convert(acc); + out_op(out_val, result); + p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + wo] = out_val; + } + } + else if constexpr(NDimSpatial == 2) + { + const long_index_t num_out = G * N * K * Ho * Wo; + const long_index_t in_stride_g = N * C * Hi * Wi; + const long_index_t in_stride_n = C * Hi * Wi; + const long_index_t in_stride_c = Hi * Wi; + const long_index_t in_stride_h = Wi; + const long_index_t wei_stride_g = K * C * Y * X; + const long_index_t wei_stride_k = C * Y * X; + const long_index_t wei_stride_c = Y * X; + const long_index_t wei_stride_y = X; + const long_index_t out_stride_g = N * K * Ho * Wo; + const long_index_t out_stride_n = K * Ho * Wo; + const long_index_t out_stride_k = Ho * Wo; + const long_index_t out_stride_h = Wo; + + for(long_index_t idx = tid; idx < num_out; idx += num_threads) + { + 
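+            // Grid-stride loop: each thread handles every num_threads-th output
+            // element. The linear index decomposes innermost-first; e.g. with
+            // Wo=7, Ho=5, K=8 (illustrative values), idx = 3 + 7*(2 + 5*(1 + 8*n))
+            // recovers wo=3, ho=2, k=1 and batch index n below.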
index_t remaining = idx; + const index_t wo = remaining % Wo; + remaining /= Wo; + const index_t ho = remaining % Ho; + remaining /= Ho; + const index_t k = remaining % K; + remaining /= K; + const index_t n = remaining % N; + const index_t g = remaining / N; + + float acc = 0.0f; + const InDataType* in_gn = p_in + g * in_stride_g + n * in_stride_n; + const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k; + + for(index_t c = 0; c < C; ++c) + { + const InDataType* in_gnc = in_gn + c * in_stride_c; + const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c; + + for(index_t y = 0; y < Y; ++y) + { + long_index_t hi = ho * stride_y + y * dilation_y - pad_y; + if(hi >= 0 && hi < Hi) + { + const InDataType* in_gnch = in_gnc + hi * in_stride_h; + const WeiDataType* wei_gkcy = wei_gkc + y * wei_stride_y; + + for(index_t x = 0; x < X; ++x) { - // Load values from memory - TIn in_loaded = in_n_di_hi_wi[c]; - TWei wei_loaded = wei_k_z_y_x[c]; - - // Apply element-wise operations - in_op(in_val, in_loaded); - wei_op(wei_val, wei_loaded); - - // Always convert to float for multiplication (FP8/BF8 don't support - // direct arithmetic) - float in_f = type_convert(in_val); - float wei_f = type_convert(wei_val); - - // Accumulate in float - acc_float += in_f * wei_f; + long_index_t wi = wo * stride_x + x * dilation_x - pad_x; + if(wi >= 0 && wi < Wi) + { + in_op(in_val, in_gnch[wi]); + wei_op(wei_val, wei_gkcy[x]); + acc += type_convert(in_val) * type_convert(wei_val); + } } } } } + + OutDataType result = type_convert(acc); + out_op(out_val, result); + p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + ho * out_stride_h + wo] = + out_val; } + } + else if constexpr(NDimSpatial == 3) + { + const long_index_t num_out = G * N * K * Do * Ho * Wo; + const long_index_t in_stride_g = N * C * Di * Hi * Wi; + const long_index_t in_stride_n = C * Di * Hi * Wi; + const long_index_t in_stride_c = Di * Hi * Wi; + const long_index_t in_stride_d = Hi * Wi; + const long_index_t in_stride_h = Wi; + const long_index_t wei_stride_g = K * C * Z * Y * X; + const long_index_t wei_stride_k = C * Z * Y * X; + const long_index_t wei_stride_c = Z * Y * X; + const long_index_t wei_stride_z = Y * X; + const long_index_t wei_stride_y = X; + const long_index_t out_stride_g = N * K * Do * Ho * Wo; + const long_index_t out_stride_n = K * Do * Ho * Wo; + const long_index_t out_stride_k = Do * Ho * Wo; + const long_index_t out_stride_d = Ho * Wo; + const long_index_t out_stride_h = Wo; - // Convert float accumulator to TAcc, then to output type - TAcc acc = type_convert(acc_float); - TOut result = type_convert(acc); + for(long_index_t idx = tid; idx < num_out; idx += num_threads) + { + index_t remaining = idx; + const index_t wo = remaining % Wo; + remaining /= Wo; + const index_t ho = remaining % Ho; + remaining /= Ho; + const index_t do_idx = remaining % Do; + remaining /= Do; + const index_t k = remaining % K; + remaining /= K; + const index_t n = remaining % N; + const index_t g = remaining / N; - // Apply output element-wise operation (if any) - out_op(out_val, result); + float acc = 0.0f; + const InDataType* in_gn = p_in + g * in_stride_g + n * in_stride_n; + const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k; - // Write transformed result - p_out[ii] = out_val; + for(index_t c = 0; c < C; ++c) + { + const InDataType* in_gnc = in_gn + c * in_stride_c; + const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c; + + for(index_t z = 0; z < Z; ++z) + { + long_index_t di = do_idx * stride_z + z 
* dilation_z - pad_z; + if(di >= 0 && di < Di) + { + const InDataType* in_gncd = in_gnc + di * in_stride_d; + const WeiDataType* wei_gkcz = wei_gkc + z * wei_stride_z; + + for(index_t y = 0; y < Y; ++y) + { + long_index_t hi = ho * stride_y + y * dilation_y - pad_y; + if(hi >= 0 && hi < Hi) + { + const InDataType* in_gncdh = in_gncd + hi * in_stride_h; + const WeiDataType* wei_gkczy = wei_gkcz + y * wei_stride_y; + + for(index_t x = 0; x < X; ++x) + { + long_index_t wi = wo * stride_x + x * dilation_x - pad_x; + if(wi >= 0 && wi < Wi) + { + in_op(in_val, in_gncdh[wi]); + wei_op(wei_val, wei_gkczy[x]); + acc += type_convert(in_val) * + type_convert(wei_val); + } + } + } + } + } + } + } + + OutDataType result = type_convert(acc); + out_op(out_val, result); + p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + do_idx * out_stride_d + + ho * out_stride_h + wo] = out_val; + } } } + +// GPU reference convolution - takes ConvParam directly +template +void naive_conv_fwd(const TIn* p_in, + const TWei* p_wei, + TOut* p_out, + const ck::utils::conv::ConvParam& conv_param, + InElementwiseOperation in_element_op = InElementwiseOperation{}, + WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{}, + OutElementwiseOperation out_element_op = OutElementwiseOperation{}, + hipStream_t stream = nullptr) +{ + const auto ndim = conv_param.num_dim_spatial_; + + const index_t G = conv_param.G_; + const index_t N = conv_param.N_; + const index_t C = conv_param.C_; + const index_t K = conv_param.K_; + + std::vector in_lengths = {G, N, C}; + std::vector wei_lengths = {G, K, C}; + std::vector out_lengths = {G, N, K}; + + for(index_t i = 0; i < ndim; ++i) + { + in_lengths.push_back(static_cast(conv_param.input_spatial_lengths_[i])); + wei_lengths.push_back(static_cast(conv_param.filter_spatial_lengths_[i])); + out_lengths.push_back(static_cast(conv_param.output_spatial_lengths_[i])); + } + + // Calculate total elements for buffer allocation + long_index_t in_total = 1, wei_total = 1, out_total = 1; + for(auto l : in_lengths) + in_total *= l; + for(auto l : wei_lengths) + wei_total *= l; + for(auto l : out_lengths) + out_total *= l; + + // Allocate packed buffers + SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn)); + SimpleDeviceMem wei_packed_buf(wei_total * sizeof(TWei)); + SimpleDeviceMem out_packed_buf(out_total * sizeof(TOut)); + + TIn* p_in_packed = static_cast(in_packed_buf.GetDeviceBuffer()); + TWei* p_wei_packed = static_cast(wei_packed_buf.GetDeviceBuffer()); + TOut* p_out_packed = static_cast(out_packed_buf.GetDeviceBuffer()); + + // Compute strides and allocate device arrays for pack/unpack + std::vector in_strides = compute_conv_tensor_strides(in_lengths, ndim); + std::vector wei_strides = compute_conv_tensor_strides(wei_lengths, ndim); + std::vector out_strides = compute_conv_tensor_strides(out_lengths, ndim); + + const size_t dim_count = in_lengths.size(); + SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t)); + SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t)); + SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t)); + SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t)); + SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t)); + SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t)); + + index_t* d_in_lengths = static_cast(in_lengths_buf.GetDeviceBuffer()); + index_t* d_in_strides = static_cast(in_strides_buf.GetDeviceBuffer()); + index_t* d_wei_lengths = static_cast(wei_lengths_buf.GetDeviceBuffer()); + index_t* d_wei_strides = 
static_cast(wei_strides_buf.GetDeviceBuffer()); + index_t* d_out_lengths = static_cast(out_lengths_buf.GetDeviceBuffer()); + index_t* d_out_strides = static_cast(out_strides_buf.GetDeviceBuffer()); + + HIP_CHECK_ERROR(hipMemcpy( + d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + + // Pack input and weight tensors to contiguous layout + constexpr int block_size = 256; + strided_copy_kernel + <<<(in_total + block_size - 1) / block_size, block_size, 0, stream>>>( + p_in, p_in_packed, d_in_lengths, d_in_strides, dim_count, in_total); + strided_copy_kernel + <<<(wei_total + block_size - 1) / block_size, block_size, 0, stream>>>( + p_wei, p_wei_packed, d_wei_lengths, d_wei_strides, dim_count, wei_total); + + // Build conv parameter vectors for kernel invocation + std::vector conv_strides(ndim); + std::vector conv_dilations(ndim); + std::vector input_pads(ndim); + for(index_t i = 0; i < ndim; ++i) + { + conv_strides[i] = static_cast(conv_param.conv_filter_strides_[i]); + conv_dilations[i] = static_cast(conv_param.conv_filter_dilations_[i]); + input_pads[i] = static_cast(conv_param.input_left_pads_[i]); + } + + // Run convolution kernel on packed data + const int out_grid = (out_total + block_size - 1) / block_size; + + if(ndim == 1) + { + naive_conv_fwd_packed<1, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_packed, + p_out_packed, + G, + N, + K, + C, + 1, + 1, + in_lengths[3], + 1, + 1, + wei_lengths[3], + 1, + 1, + out_lengths[3], + 1, + 1, + conv_strides[0], + 1, + 1, + conv_dilations[0], + 0, + 0, + input_pads[0], + in_element_op, + wei_element_op, + out_element_op); + } + else if(ndim == 2) + { + naive_conv_fwd_packed<2, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_packed, + p_out_packed, + G, + N, + K, + C, + 1, + in_lengths[3], + in_lengths[4], + 1, + wei_lengths[3], + wei_lengths[4], + 1, + out_lengths[3], + out_lengths[4], + 1, + conv_strides[0], + conv_strides[1], + 1, + conv_dilations[0], + conv_dilations[1], + 0, + input_pads[0], + input_pads[1], + in_element_op, + wei_element_op, + out_element_op); + } + else // 3D + { + naive_conv_fwd_packed<3, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_packed, + p_out_packed, + G, + N, + K, + C, + in_lengths[3], + in_lengths[4], + in_lengths[5], + wei_lengths[3], + wei_lengths[4], + wei_lengths[5], + out_lengths[3], + out_lengths[4], + out_lengths[5], + conv_strides[0], + conv_strides[1], + conv_strides[2], + conv_dilations[0], + conv_dilations[1], + conv_dilations[2], + input_pads[0], + input_pads[1], + input_pads[2], + in_element_op, + wei_element_op, + out_element_op); + } + + // Unpack + strided_copy_kernel<<>>( + p_out_packed, p_out, d_out_lengths, d_out_strides, 
dim_count, out_total);
+
+    HIP_CHECK_ERROR(hipGetLastError());
+
+    // Memory automatically freed by SimpleDeviceMem destructors
+}
+
 } // namespace ref
 } // namespace ck
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp
new file mode 100644
index 0000000000..0a7b58b310
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp
@@ -0,0 +1,177 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include <algorithm>
+#include <vector>
+
+namespace ck {
+namespace ref {
+
+// RAII wrapper for device memory to prevent leaks
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&p_mem_), mem_size));
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Helper function to map layout dimension character to index in lengths array
+// lengths array structure: [G, N/K, C/K, spatial...]
+inline int map_dim_char_to_index(char dim_char, index_t ndim_spatial, bool is_weight)
+{
+    // G dimension
+    if(dim_char == 'G')
+        return 0;
+
+    // Batch/output channels dimension (N for input/output, K for weight's first non-G dim)
+    if(dim_char == 'N')
+        return 1;
+    if(dim_char == 'K' && is_weight)
+        return 1;
+
+    // Channel dimension (C for input/weight, K for output)
+    if(dim_char == 'C')
+        return 2;
+    if(dim_char == 'K' && !is_weight)
+        return 2;
+
+    // Spatial dimensions - map based on ndim_spatial
+    // Input/Output use: D/H/W, Weight uses: Z/Y/X
+    if(ndim_spatial == 1)
+    {
+        if(dim_char == 'W' || dim_char == 'X')
+            return 3;
+    }
+    else if(ndim_spatial == 2)
+    {
+        if(dim_char == 'H' || dim_char == 'Y')
+            return 3;
+        if(dim_char == 'W' || dim_char == 'X')
+            return 4;
+    }
+    else if(ndim_spatial == 3)
+    {
+        if(dim_char == 'D' || dim_char == 'Z')
+            return 3;
+        if(dim_char == 'H' || dim_char == 'Y')
+            return 4;
+        if(dim_char == 'W' || dim_char == 'X')
+            return 5;
+    }
+
+    // Should not reach here
+    return -1;
+}
+
+// Template function to compute layout-aware strides based on layout name
+// The layout name directly encodes memory ordering from left to right
+template <typename Layout>
+inline std::vector<index_t> compute_conv_tensor_strides(const std::vector<index_t>& lengths,
+                                                        index_t ndim_spatial)
+{
+    constexpr const char* layout_name = Layout::name;
+    const int num_dims                = static_cast<int>(lengths.size());
+    std::vector<index_t> strides(num_dims, 0);
+
+    // Determine if this is a weight tensor (has 'K' but not 'N')
+    bool has_k = false;
+    bool has_n = false;
+    for(const char* p = layout_name; *p != '\0'; ++p)
+    {
+        if(*p == 'K')
+            has_k = true;
+        if(*p == 'N')
+            has_n = true;
+    }
+    bool is_weight = has_k && !has_n;
+
+    // Build dimension ordering from layout name (parse string)
+    std::vector<char> dim_order;
+    const char dim_chars[] = {'G', 'N', 'K', 'C', 'D', 'H', 'W', 'X', 'Y', 'Z'};
+    for(const char* p = layout_name; *p != '\0'; ++p)
+    {
+        char c = *p;
+        // Skip underscores (strided layouts)
+        if(c == '_')
+            continue;
+        // Valid dimension characters
+        if(std::find(std::begin(dim_chars), std::end(dim_chars), c) != std::end(dim_chars))
+        {
+            dim_order.push_back(c);
+        }
+    }
+
+    // Compute strides: process from right to left (innermost to outermost)
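+    // Example (illustrative): for layout NHWGC with lengths [G=2, N=4, C=8, H=7, W=7],
+    // walking the name right-to-left assigns stride 1 to C, then G=8, W=16, H=112,
+    // N=784 -- while the strides array stays in [G, N, C, H, W] storage order.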
+    index_t stride = 1;
+    for(int i = static_cast<int>(dim_order.size()) - 1; i >= 0; --i)
+    {
+        char dim_char  = dim_order[i];
+        int length_idx = map_dim_char_to_index(dim_char, ndim_spatial, is_weight);
+
+        if(length_idx >= 0 && length_idx < num_dims)
+        {
+            strides[length_idx] = stride;
+            stride *= lengths[length_idx];
+        }
+    }
+
+    return strides;
+}
+
+// Unified kernel for strided tensor copy operations
+// IsUnpack=false: Pack strided -> contiguous
+// IsUnpack=true: Unpack contiguous -> strided
+template <bool IsUnpack, typename DataType>
+__global__ void strided_copy_kernel(const DataType* __restrict__ src,
+                                    DataType* __restrict__ dst,
+                                    const index_t* tensor_lengths,
+                                    const index_t* strided_strides,
+                                    int num_dims,
+                                    long_index_t total_elements)
+{
+    const long_index_t tid         = blockIdx.x * blockDim.x + threadIdx.x;
+    const long_index_t num_threads = blockDim.x * gridDim.x;
+
+    for(long_index_t linear_idx = tid; linear_idx < total_elements; linear_idx += num_threads)
+    {
+        // Compute strided index from linear index
+        long_index_t remaining   = linear_idx;
+        long_index_t strided_idx = 0;
+
+        for(int dim = num_dims - 1; dim >= 0; --dim)
+        {
+            index_t coord = remaining % tensor_lengths[dim];
+            remaining /= tensor_lengths[dim];
+            strided_idx += coord * strided_strides[dim];
+        }
+
+        // Direction determines which is src and which is dst
+        if constexpr(IsUnpack)
+        {
+            // Unpack: src is contiguous (linear_idx), dst is strided (strided_idx)
+            dst[strided_idx] = src[linear_idx];
+        }
+        else
+        {
+            // Pack: src is strided (strided_idx), dst is contiguous (linear_idx)
+            dst[linear_idx] = src[strided_idx];
+        }
+    }
+}
+
+} // namespace ref
+} // namespace ck
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
index b9e463dc1e..67d082d07b 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
@@ -18,6 +18,7 @@
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
+#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"

 namespace ck {
@@ -89,8 +90,39 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
     wei_device_buf.ToDevice(wei.mData.data());

     float max_accumulated_value = 0;
-    if(do_verification)
+    if(do_verification == 2)
     {
+        // Use GPU reference for verification
+        std::cout << "Using GPU reference for verification" << std::endl;
+
+        // Allocate GPU reference output buffer
+        DeviceMem gpu_ref_in_buf(sizeof(InDataType) * in_host.mDesc.GetElementSpaceSize());
+
+        // Call GPU reference with ConvParam directly
+        ref::naive_conv_bwd_data(
+            reinterpret_cast<InDataType*>(gpu_ref_in_buf.GetDeviceBuffer()),
+            reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+            reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            conv_param,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+
+        // Copy GPU reference result to host for comparison
+        gpu_ref_in_buf.FromDevice(in_host.mData.data());
+        max_accumulated_value = *std::max_element(in_host.mData.begin(), in_host.mData.end());
+    }
+    else if(do_verification == 1)
+    {
+        // Use CPU reference for verification (default)
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData{};
-    auto ref_invoker = ref_conv.MakeInvoker();
-    auto ref_argument =
ref_conv.MakeArgument(input, - weight_host_result, - output, - conv_param.conv_filter_strides_, - conv_param.conv_filter_dilations_, - conv_param.input_left_pads_, - conv_param.input_right_pads_, - in_element_op, - wei_element_op, - out_element_op, - {}, - {}, - {}); + if(do_verification == 1) + { + // CPU reference + auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight{}; + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(input, + weight_host_result, + output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op, + {}, + {}, + {}); + + ref_invoker.Run(ref_argument); + } + else if(do_verification == 2) + { + // GPU reference + std::cout << "Running GPU reference implementation..." << std::endl; + + // Allocate device memory for reference + DeviceMem in_ref_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); + DeviceMem wei_ref_buf(sizeof(WeiDataType) * + weight_host_result.mDesc.GetElementSpaceSize()); + DeviceMem out_ref_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize()); + + in_ref_buf.ToDevice(input.mData.data()); + out_ref_buf.ToDevice(output.mData.data()); + + // Call GPU reference with ConvParam directly + ck::ref::naive_conv_bwd_weight( + static_cast(in_ref_buf.GetDeviceBuffer()), + static_cast(wei_ref_buf.GetDeviceBuffer()), + static_cast(out_ref_buf.GetDeviceBuffer()), + conv_param, + in_element_op, + wei_element_op, + out_element_op); + + // Copy result back to host + wei_ref_buf.FromDevice(weight_host_result.mData.data()); + } - ref_invoker.Run(ref_argument); max_accumulated_value = *std::max_element(weight_host_result.mData.begin(), weight_host_result.mData.end()); } diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 427d2b14df..aeed6f4f06 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -22,6 +22,7 @@ #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" namespace ck { namespace profiler { @@ -113,8 +114,38 @@ bool profile_grouped_conv_fwd_impl(int do_verification, wei_device_buf.ToDevice(weight.mData.data()); // run reference op - if(do_verification) + if(do_verification == 2) { + // Use GPU reference for verification + std::cout << "Using GPU reference for verification" << std::endl; + + // Allocate GPU reference output buffer + DeviceMem gpu_ref_out_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); + + // Call GPU reference with ConvParam directly + ref::naive_conv_fwd( + reinterpret_cast(in_device_buf.GetDeviceBuffer()), + reinterpret_cast(wei_device_buf.GetDeviceBuffer()), + reinterpret_cast(gpu_ref_out_buf.GetDeviceBuffer()), + conv_param, + in_element_op, + wei_element_op, + out_element_op); + + // Copy GPU reference result to host for comparison + gpu_ref_out_buf.FromDevice(host_output.mData.data()); + } + else if(do_verification == 1) + { + // Use CPU reference for verification (default) auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd +#include +#include +#include + +#include "ck/ck.hpp" +#include 
"ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using InDataType = float; +using WeiDataType = float; +using OutDataType = float; +using AccDataType = float; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +using DeviceConvNaive = ck::tensor_operation::device:: + DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K; + +template +bool run_conv3d_naive_test(const ck::utils::conv::ConvParam& conv_param) +{ + using namespace ck; + using namespace ck::tensor_operation::host; + + using InLayout = ck::tensor_layout::convolution::GNCDHW; + using WeiLayout = ck::tensor_layout::convolution::GKCZYX; + using OutLayout = ck::tensor_layout::convolution::GNKDHW; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + // Initialize tensors + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + + // Run device kernel - convert long_index_t vectors to index_t + std::vector input_spatial_lengths(conv_param.input_spatial_lengths_.begin(), + conv_param.input_spatial_lengths_.end()); + std::vector filter_spatial_lengths(conv_param.filter_spatial_lengths_.begin(), + conv_param.filter_spatial_lengths_.end()); + auto output_spatial_lengths_long = conv_param.GetOutputSpatialLengths(); + std::vector output_spatial_lengths(output_spatial_lengths_long.begin(), + output_spatial_lengths_long.end()); + std::vector conv_filter_strides(conv_param.conv_filter_strides_.begin(), + conv_param.conv_filter_strides_.end()); + std::vector conv_filter_dilations(conv_param.conv_filter_dilations_.begin(), + conv_param.conv_filter_dilations_.end()); + std::vector input_left_pads(conv_param.input_left_pads_.begin(), + conv_param.input_left_pads_.end()); + std::vector input_right_pads(conv_param.input_right_pads_.begin(), + conv_param.input_right_pads_.end()); + + auto conv = DeviceConvNaive{}; + auto invoker = conv.MakeInvoker(); + auto 
argument = + conv.MakeArgument(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + conv_param.N_, + conv_param.K_, + conv_param.C_, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + if(!conv.IsSupportedArgument(argument)) + { + std::cout << "Unsupported argument for naive conv3d kernel" << std::endl; + return false; + } + + invoker.Run(argument, StreamConfig{nullptr, false}); + + // Run CPU reference + auto ref_conv = ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + out_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + ref_invoker.Run(ref_argument); + + // Compare results + out_device_buf.FromDevice(out_device.mData.data()); + + return ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-3, 1e-3); +} + +TEST(TestConv3dNaive, Conv3dNaive_Small) +{ + // Small 3D convolution test + ck::utils::conv::ConvParam param{ + 3, // spatial_dim + 1, // G + 2, // N + 16, // K + 16, // C + {3, 3, 3}, // filter + {7, 7, 7}, // input spatial + {2, 2, 2}, // strides + {1, 1, 1}, // dilations + {1, 1, 1}, // left pads + {1, 1, 1} // right pads + }; + + bool pass = run_conv3d_naive_test<3>(param); + EXPECT_TRUE(pass); +} + +TEST(TestConv3dNaive, Conv3dNaive_Medium) +{ + // Medium size 3D convolution test + ck::utils::conv::ConvParam param{ + 3, // spatial_dim + 1, // G + 4, // N + 32, // K + 32, // C + {3, 3, 3}, // filter + {14, 14, 14}, // input spatial + {1, 1, 1}, // strides + {1, 1, 1}, // dilations + {1, 1, 1}, // left pads + {1, 1, 1} // right pads + }; + + bool pass = run_conv3d_naive_test<3>(param); + EXPECT_TRUE(pass); +} + +TEST(TestConv3dNaive, Conv3dNaive_UnitFilter) +{ + // 1x1x1 filter (no padding) + ck::utils::conv::ConvParam param{ + 3, // spatial_dim + 1, // G + 2, // N + 24, // K + 24, // C + {1, 1, 1}, // filter + {8, 8, 8}, // input spatial + {1, 1, 1}, // strides + {1, 1, 1}, // dilations + {0, 0, 0}, // left pads + {0, 0, 0} // right pads + }; + + bool pass = run_conv3d_naive_test<3>(param); + EXPECT_TRUE(pass); +} + +int main(int argc, char** argv) +{ + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/gpu_reference/CMakeLists.txt b/test/gpu_reference/CMakeLists.txt new file mode 100644 index 0000000000..443818feb3 --- /dev/null +++ b/test/gpu_reference/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT + +add_gtest_executable(test_gpu_reference_conv_fwd test_gpu_reference_conv_fwd.cpp) +target_link_libraries(test_gpu_reference_conv_fwd PRIVATE utility) + +add_gtest_executable(test_gpu_reference_conv_bwd_data test_gpu_reference_conv_bwd_data.cpp) +target_link_libraries(test_gpu_reference_conv_bwd_data PRIVATE utility) + +add_gtest_executable(test_gpu_reference_conv_bwd_weight test_gpu_reference_conv_bwd_weight.cpp) +target_link_libraries(test_gpu_reference_conv_bwd_weight PRIVATE utility) diff --git a/test/gpu_reference/common_test_params.hpp b/test/gpu_reference/common_test_params.hpp new file mode 100644 index 0000000000..bcd638d0cb --- /dev/null +++ b/test/gpu_reference/common_test_params.hpp @@ -0,0 +1,137 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck/ck.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include + +namespace ck { +namespace test { + +// Common test shapes for all convolution tests (fwd, bwd_data, bwd_weight) +namespace conv_test_shapes { + +// 2D Conv, FP16, Small +inline ck::utils::conv::ConvParam get_2d_small() +{ + return ck::utils::conv::ConvParam(2, // num_dim_spatial + 1, // G + 2, // N + 8, // K + 8, // C + {3, 3}, // filter_spatial + {7, 7}, // input_spatial + {1, 1}, // strides + {1, 1}, // dilations + {0, 0}, // left_pads + {0, 0} // right_pads + ); +} + +// 2D Conv, FP32, Medium +inline ck::utils::conv::ConvParam get_2d_medium() +{ + return ck::utils::conv::ConvParam(2, // num_dim_spatial + 1, // G + 4, // N + 16, // K + 16, // C + {3, 3}, // filter_spatial + {14, 14}, // input_spatial + {1, 1}, // strides + {1, 1}, // dilations + {0, 0}, // left_pads + {0, 0} // right_pads + ); +} + +// 1D Conv, FP16 +inline ck::utils::conv::ConvParam get_1d() +{ + return ck::utils::conv::ConvParam(1, // num_dim_spatial + 1, // G + 2, // N + 8, // K + 8, // C + {3}, // filter_spatial + {16}, // input_spatial + {1}, // strides + {1}, // dilations + {0}, // left_pads + {0} // right_pads + ); +} + +// 3D Conv, FP16, Small +inline ck::utils::conv::ConvParam get_3d_small() +{ + return ck::utils::conv::ConvParam(3, // num_dim_spatial + 1, // G + 1, // N + 8, // K + 8, // C + {3, 3, 3}, // filter_spatial + {5, 5, 5}, // input_spatial + {1, 1, 1}, // strides + {1, 1, 1}, // dilations + {0, 0, 0}, // left_pads + {0, 0, 0} // right_pads + ); +} + +// 2D Conv with stride +inline ck::utils::conv::ConvParam get_2d_stride2() +{ + return ck::utils::conv::ConvParam(2, // num_dim_spatial + 1, // G + 2, // N + 8, // K + 8, // C + {3, 3}, // filter_spatial + {8, 8}, // input_spatial + {2, 2}, // strides + {1, 1}, // dilations + {0, 0}, // left_pads + {0, 0} // right_pads + ); +} + +// 2D Grouped Conv, FP16, G=2 +inline ck::utils::conv::ConvParam get_2d_grouped_g2() +{ + return ck::utils::conv::ConvParam(2, // num_dim_spatial + 2, // G + 2, // N + 8, // K (8 total output channels) + 16, // C (16 total input channels, 8 per group with G=2) + {3, 3}, // filter_spatial + {7, 7}, // input_spatial + {1, 1}, // strides + {1, 1}, // dilations + {0, 0}, // left_pads + {0, 0} // right_pads + ); +} + +// 2D Grouped Conv, FP32, G=4 +inline ck::utils::conv::ConvParam get_2d_grouped_g4() +{ + return ck::utils::conv::ConvParam(2, // num_dim_spatial + 4, // G + 1, // N + 16, // K (16 total output channels) + 16, // C (16 total input channels, 4 per group with G=4) + {3, 3}, // filter_spatial + {8, 8}, // input_spatial + {1, 1}, // strides + {1, 1}, // dilations + {0, 0}, // 
left_pads + {0, 0} // right_pads + ); +} + +} // namespace conv_test_shapes +} // namespace test +} // namespace ck diff --git a/test/gpu_reference/gpu_reference_utils.hpp b/test/gpu_reference/gpu_reference_utils.hpp new file mode 100644 index 0000000000..fc017c8734 --- /dev/null +++ b/test/gpu_reference/gpu_reference_utils.hpp @@ -0,0 +1,385 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck/ck.hpp" +#include "ck/host_utility/hip_check_error.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +// CPU references +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp" + +// GPU references +#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +#include "common_test_params.hpp" + +namespace ck { +namespace test { + +enum class ConvKernelType +{ + Forward, + BackwardData, + BackwardWeight +}; + +// Helper function to initialize and copy a tensor to device +template +void initialize_and_copy_tensor(Tensor& host_tensor, DeviceMem& device_mem) +{ + host_tensor.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + device_mem.ToDevice(host_tensor.mData.data()); +} + +// Helper to get default layout types based on NDimSpatial +template +struct DefaultConvLayouts +{ + using InLayout = std::conditional_t>; + using WeiLayout = std::conditional_t>; + using OutLayout = std::conditional_t>; +}; + +// Forward convolution implementation +template +bool test_conv_fwd_impl(const ck::utils::conv::ConvParam& params, + const Tensor& input_cpu, + const Tensor& weight_cpu, + DeviceMem& input_dev, + DeviceMem& weight_dev, + DeviceMem& output_dev) +{ + using InElementOp = tensor_operation::element_wise::PassThrough; + using WeiElementOp = tensor_operation::element_wise::PassThrough; + using OutElementOp = tensor_operation::element_wise::PassThrough; + + // Call GPU reference with ConvParam directly + ref::naive_conv_fwd( + reinterpret_cast(input_dev.GetDeviceBuffer()), + reinterpret_cast(weight_dev.GetDeviceBuffer()), + reinterpret_cast(output_dev.GetDeviceBuffer()), + params); + + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Run CPU reference + std::vector strides_long(params.conv_filter_strides_.begin(), + params.conv_filter_strides_.end()); + std::vector dilations_long(params.conv_filter_dilations_.begin(), + params.conv_filter_dilations_.end()); + std::vector pads_long(params.input_left_pads_.begin(), + params.input_left_pads_.end()); + + Tensor input_ref = input_cpu; + Tensor weight_ref = weight_cpu; + Tensor output_ref( + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(params)); + + auto ref_conv = tensor_operation::host::ReferenceConvFwd(); + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_arg = ref_conv.MakeArgument(input_ref, + weight_ref, + output_ref, + strides_long, + 
dilations_long, + pads_long, + pads_long, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + ref_invoker.Run(ref_arg); + + // Copy result from device and compare + Tensor output_gpu(output_ref.mDesc); + output_dev.FromDevice(output_gpu.mData.data()); + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Compare results + return ck::utils::check_err(output_gpu, output_ref); +} + +// Backward data convolution implementation +template +bool test_conv_bwd_data_impl(const ck::utils::conv::ConvParam& params, + const Tensor& weight_cpu, + const Tensor& output_cpu, + DeviceMem& weight_dev, + DeviceMem& output_dev, + DeviceMem& input_dev) +{ + using InElementOp = tensor_operation::element_wise::PassThrough; + using WeiElementOp = tensor_operation::element_wise::PassThrough; + using OutElementOp = tensor_operation::element_wise::PassThrough; + + // Call GPU reference with ConvParam directly + ref::naive_conv_bwd_data( + reinterpret_cast(input_dev.GetDeviceBuffer()), + reinterpret_cast(weight_dev.GetDeviceBuffer()), + reinterpret_cast(output_dev.GetDeviceBuffer()), + params); + + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Run CPU reference + std::vector strides_long(params.conv_filter_strides_.begin(), + params.conv_filter_strides_.end()); + std::vector dilations_long(params.conv_filter_dilations_.begin(), + params.conv_filter_dilations_.end()); + std::vector pads_long(params.input_left_pads_.begin(), + params.input_left_pads_.end()); + + Tensor input_ref( + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(params)); + Tensor weight_ref = weight_cpu; + Tensor output_ref = output_cpu; + + auto ref_conv = tensor_operation::host::ReferenceConvBwdData(); + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_arg = ref_conv.MakeArgument(input_ref, + weight_ref, + output_ref, + strides_long, + dilations_long, + pads_long, + pads_long, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + ref_invoker.Run(ref_arg); + + // Copy result from device and compare + Tensor input_gpu(input_ref.mDesc); + input_dev.FromDevice(input_gpu.mData.data()); + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Compare results + return ck::utils::check_err(input_gpu, input_ref); +} + +// Backward weight convolution implementation +template +bool test_conv_bwd_weight_impl(const ck::utils::conv::ConvParam& params, + const Tensor& input_cpu, + const Tensor& output_cpu, + DeviceMem& input_dev, + DeviceMem& output_dev, + DeviceMem& weight_dev) +{ + using InElementOp = tensor_operation::element_wise::PassThrough; + using WeiElementOp = tensor_operation::element_wise::PassThrough; + using OutElementOp = tensor_operation::element_wise::PassThrough; + + // Call GPU reference with ConvParam directly + ref::naive_conv_bwd_weight( + reinterpret_cast(input_dev.GetDeviceBuffer()), + reinterpret_cast(weight_dev.GetDeviceBuffer()), + reinterpret_cast(output_dev.GetDeviceBuffer()), + params); + + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Run CPU reference + std::vector strides_long(params.conv_filter_strides_.begin(), + params.conv_filter_strides_.end()); + std::vector dilations_long(params.conv_filter_dilations_.begin(), + params.conv_filter_dilations_.end()); + std::vector pads_long(params.input_left_pads_.begin(), + params.input_left_pads_.end()); + + Tensor input_ref = input_cpu; + Tensor weight_ref( + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(params)); + Tensor output_ref = output_cpu; + + auto ref_conv = tensor_operation::host::ReferenceConvBwdWeight(); + auto ref_invoker = 
ref_conv.MakeInvoker(); + auto ref_arg = ref_conv.MakeArgument(input_ref, + weight_ref, + output_ref, + strides_long, + dilations_long, + pads_long, + pads_long, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + ref_invoker.Run(ref_arg); + + // Copy result from device and compare + Tensor weight_gpu(weight_ref.mDesc); + weight_dev.FromDevice(weight_gpu.mData.data()); + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Compare results + return ck::utils::check_err(weight_gpu, weight_ref); +} + +// Main test function - dispatches to specific implementations +template ::InLayout, + typename WeiLayout = typename DefaultConvLayouts::WeiLayout, + typename OutLayout = typename DefaultConvLayouts::OutLayout> +bool test_conv_gpu_ref(const ck::utils::conv::ConvParam& params, ConvKernelType kernel_type) +{ + // Create tensor descriptors using the specified layouts + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(params); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(params); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(params); + + // Create tensors using tensor descriptors (supports multiple layouts) + Tensor input(in_g_n_c_wis_desc); + Tensor weight(wei_g_k_c_xs_desc); + Tensor output(out_g_n_k_wos_desc); + + // Allocate device memory + DeviceMem input_dev(input.mData.size() * sizeof(InDataType)); + DeviceMem weight_dev(weight.mData.size() * sizeof(WeiDataType)); + DeviceMem output_dev(output.mData.size() * sizeof(OutDataType)); + + // Initialize and copy tensors based on kernel type + if(kernel_type == ConvKernelType::Forward) + { + initialize_and_copy_tensor(input, input_dev); + initialize_and_copy_tensor(weight, weight_dev); + } + else if(kernel_type == ConvKernelType::BackwardData) + { + initialize_and_copy_tensor(weight, weight_dev); + initialize_and_copy_tensor(output, output_dev); + } + else // BackwardWeight + { + initialize_and_copy_tensor(input, input_dev); + initialize_and_copy_tensor(output, output_dev); + } + + // Dispatch to appropriate implementation with layout types + if(kernel_type == ConvKernelType::Forward) + { + return test_conv_fwd_impl( + params, input, weight, input_dev, weight_dev, output_dev); + } + else if(kernel_type == ConvKernelType::BackwardData) + { + return test_conv_bwd_data_impl( + params, weight, output, weight_dev, output_dev, input_dev); + } + else // BackwardWeight + { + return test_conv_bwd_weight_impl( + params, input, output, input_dev, output_dev, weight_dev); + } +} + +} // namespace test +} // namespace ck diff --git a/test/gpu_reference/test_gpu_reference_conv_bwd_data.cpp b/test/gpu_reference/test_gpu_reference_conv_bwd_data.cpp new file mode 100644 index 0000000000..0d69a9e77b --- /dev/null +++ b/test/gpu_reference/test_gpu_reference_conv_bwd_data.cpp @@ -0,0 +1,224 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+#include "gpu_reference_utils.hpp"
+
+using namespace ck;
+using ck::test::ConvKernelType;
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_2d_small();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP32Medium)
+{
+    auto params = test::conv_test_shapes::get_2d_medium();
+    bool result =
+        test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv1DFP16)
+{
+    auto params = test::conv_test_shapes::get_1d();
+    bool result =
+        test::test_conv_gpu_ref<1, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv3DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    bool result =
+        test::test_conv_gpu_ref<3, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16Stride2)
+{
+    auto params = test::conv_test_shapes::get_2d_stride2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedG2)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedG4)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g4();
+    bool result =
+        test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NHWGC,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NHWGK>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NHWGC,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NHWGK>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKCYX,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKCYX,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
diff --git a/test/gpu_reference/test_gpu_reference_conv_bwd_weight.cpp b/test/gpu_reference/test_gpu_reference_conv_bwd_weight.cpp
new file mode 100644
index 0000000000..b373052a0f
--- /dev/null
+++ b/test/gpu_reference/test_gpu_reference_conv_bwd_weight.cpp
@@ -0,0 +1,224 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+#include "gpu_reference_utils.hpp"
+
+using namespace ck;
+using ck::test::ConvKernelType;
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_2d_small();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP32Medium)
+{
+    auto params = test::conv_test_shapes::get_2d_medium();
+    bool result =
+        test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv1DFP16)
+{
+    auto params = test::conv_test_shapes::get_1d();
+    bool result =
+        test::test_conv_gpu_ref<1, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv3DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    bool result =
+        test::test_conv_gpu_ref<3, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16Stride2)
+{
+    auto params = test::conv_test_shapes::get_2d_stride2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedG2)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedG4)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g4();
+    bool result =
+        test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NHWGC,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NHWGK>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NHWGC,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NHWGK>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKCYX,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKCYX,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
diff --git a/test/gpu_reference/test_gpu_reference_conv_fwd.cpp b/test/gpu_reference/test_gpu_reference_conv_fwd.cpp
new file mode 100644
index 0000000000..1182922a58
--- /dev/null
+++ b/test/gpu_reference/test_gpu_reference_conv_fwd.cpp
@@ -0,0 +1,222 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+#include "gpu_reference_utils.hpp"
+
+using namespace ck;
+using ck::test::ConvKernelType;
+
+TEST(GpuReferenceConvFwd, Conv2DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_2d_small();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP32Medium)
+{
+    auto params = test::conv_test_shapes::get_2d_medium();
+    bool result = test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv1DFP16)
+{
+    auto params = test::conv_test_shapes::get_1d();
+    bool result =
+        test::test_conv_gpu_ref<1, half_t, half_t, half_t>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv3DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    bool result =
+        test::test_conv_gpu_ref<3, half_t, half_t, half_t>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP16Stride2)
+{
+    auto params = test::conv_test_shapes::get_2d_stride2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP16GroupedG2)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP32GroupedG4)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g4();
+    bool result = test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                float,
+                                float,
+                                float,
+                                tensor_layout::convolution::NHWGC,
+                                tensor_layout::convolution::GKYXC,
+                                tensor_layout::convolution::NHWGK>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                half_t,
+                                half_t,
+                                half_t,
+                                tensor_layout::convolution::NHWGC,
+                                tensor_layout::convolution::GKYXC,
+                                tensor_layout::convolution::NHWGK>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                float,
+                                float,
+                                float,
+                                tensor_layout::convolution::NGCHW,
+                                tensor_layout::convolution::GKYXC,
+                                tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                half_t,
+                                half_t,
+                                half_t,
+                                tensor_layout::convolution::NGCHW,
+                                tensor_layout::convolution::GKYXC,
+                                tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                float,
+                                float,
+                                float,
+                                tensor_layout::convolution::NGCHW,
+                                tensor_layout::convolution::GKCYX,
+                                tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                half_t,
+                                half_t,
+                                half_t,
+                                tensor_layout::convolution::NGCHW,
+                                tensor_layout::convolution::GKCYX,
+                                tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp
index 53b8ec32af..a30f5d349d 100644
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp
@@ -73,7 +73,7 @@ bool RunConvBwdDataTest(const ck::utils::conv::ConvParam& param, ck::index_t spl
                InLayout,
                DataType,
                DataType,
-               DataType>(true, // do_verification
+               DataType>(2, // do_verification
                          1, // init_method
                          false, // do_log
                          false, // time_kernel
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
index 14ed1b8939..efedf416f0 100644
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
@@ -47,7 +47,7 @@ class TestGroupedConvndBwdDataXdl : public ::testing::Test
             DataType,
             DataType,
             DataType>(
-            true, // do_verification
+            2, // do_verification
             1, // init_method: integer value
             false, // do_log
             false, // time_kernel
diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
index 4b5e38dea6..5d56615834 100644
--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
@@ -73,7 +73,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
             InDataType,
             WeiDataType,
             OutDataType>(
-            true, // do_verification
+            2, // do_verification
             1, // init_method: integer value
             false, // do_log
             false, // time_kernel
diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp
index aff6ba8873..07d80dfad2 100644
--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp
@@ -80,7 +80,7 @@ bool RunConvBwdWeightTest(const ck::utils::conv::ConvParam& param, ck::index_t s
             InDataType,
             WeiDataType,
             OutDataType>(
-            true, // do_verification
+            2, // do_verification
             1, // init_method
             false, // do_log
             false, // time_kernel
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
index b5c5248df5..e1207b1133 100644
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
@@ -46,7 +46,7 @@ class TestGroupedConvndFwd : public ::testing::Test
             DataType,
             DataType,
             IndexType>(
-            true, // do_verification
+            2, // do_verification
             1, // init_method: integer value
             false, // do_log
             false, // time_kernel
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp
index c99f7ccf2f..8bfdbabd54 100644
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp
@@ -77,7 +77,7 @@ bool RunConvTest(const ck::utils::conv::ConvParam& param)
             DataType,
             DataType,
             DataType,
-            IndexType>(true, // do_verification
+            IndexType>(2, // do_verification
                        1, // init_method
                        false, // do_log
                        false, // time_kernel
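
Note: every test case in the three new files follows the same shape: build a ConvParam through a test::conv_test_shapes helper, instantiate test::test_conv_gpu_ref with the spatial rank, the in/wei/out data types, optional layout tags, and the ConvKernelType, then assert on the boolean result. As a minimal sketch of what a follow-on case could look like (hypothetical, not part of this diff; it assumes test_conv_gpu_ref and the GPU reference kernels also instantiate for ck::bhalf_t, which these files only exercise for half_t and float):

// Hypothetical example, shown only to illustrate the test pattern above.
// Assumes bf16 (ck::bhalf_t) support in test_conv_gpu_ref; this diff does
// not add or verify such support.
TEST(GpuReferenceConvFwd, Conv2DBF16Small)
{
    auto params = test::conv_test_shapes::get_2d_small();
    bool result =
        test::test_conv_gpu_ref<2, bhalf_t, bhalf_t, bhalf_t>(params, ConvKernelType::Forward);
    EXPECT_TRUE(result);
}

Because the layout tags default when omitted, such a case would run against the helper's default layouts; passing explicit tags, as the grouped NHWGC/NGCHW cases above do, is only needed when a non-default layout is under test.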