diff --git a/example/09_convnd_fwd/convnd_fwd_common.hpp b/example/09_convnd_fwd/convnd_fwd_common.hpp
index 93c0a4c5cb..316dcadb7b 100644
--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
@@ -18,6 +18,8 @@
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
+#include "ck_tile/host/hip_check_error.hpp"
 
 using ::ck::DeviceMem;
 using ::ck::HostTensorDescriptor;
@@ -25,7 +27,7 @@ using ::ck::Tensor;
 
 void print_helper_msg()
 {
-    std::cout << "arg1: verification (0=no, 1=yes)\n"
+    std::cout << "arg1: verification (0=no, 1=CPU, 2=GPU)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
              << "arg3: time kernel (0=no, 1=yes)\n"
              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
@@ -130,7 +132,7 @@ template
-bool run_grouped_conv_fwd(bool do_verification,
+bool run_grouped_conv_fwd(int do_verification,
                          int init_method,
                          bool time_kernel,
                          const ck::utils::conv::ConvParam& conv_param,
@@ -233,8 +235,11 @@ bool run_grouped_conv_fwd(bool do_verification,
     std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
               << " GB/s, " << conv.GetTypeString() << std::endl;
 
-    if(do_verification)
+    std::cout << "do_verification = " << do_verification << std::endl;
+
+    if(do_verification == 1)
     {
+        // CPU verification
         auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                      InDataType,
                                                                      WeiDataType,
                                                                      OutDataType,
                                                                      InElementOp,
                                                                      WeiElementOp,
                                                                      OutElementOp>();
 
         auto ref_invoker  = ref_conv.MakeInvoker();
         auto ref_argument = ref_conv.MakeArgument(in,
                                                   wei,
                                                   out_host,
                                                   conv_param.conv_filter_strides_,
                                                   conv_param.conv_filter_dilations_,
                                                   conv_param.input_left_pads_,
                                                   conv_param.input_right_pads_,
                                                   in_element_op,
                                                   wei_element_op,
                                                   out_element_op);
 
         ref_invoker.Run(ref_argument);
 
         out_device_buf.FromDevice(out_device.mData.data());
 
         return ck::utils::check_err(out_device,
                                     out_host,
                                     "Error: incorrect results!",
                                     get_rtol<OutDataType>(),
                                     get_atol<OutDataType>());
     }
+    else if(do_verification == 2)
+    {
+        // GPU verification using naive GPU reference
+        std::cout << "Running GPU verification..." << std::endl;
+
+        // Allocate and ZERO GPU memory for reference output
+        DeviceMem out_device_ref_buf(sizeof(OutDataType) *
+                                     out_device.mDesc.GetElementSpaceSize());
+        out_device_ref_buf.SetZero();
+
+        // Extract dimensions using helper function
+        ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
+
+        // Launch GPU reference kernel
+        constexpr ck::index_t block_size         = 256;
+        const ck::long_index_t output_length     = dims.N * dims.Do * dims.Ho * dims.Wo * dims.K;
+        const ck::index_t grid_size              = (output_length + block_size - 1) / block_size;
+
+        auto gpu_ref_kernel =
+            ck::ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType, WeiDataType, OutDataType>;
+
+        gpu_ref_kernel<<<grid_size, block_size>>>(
+            reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
+            reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+            reinterpret_cast<OutDataType*>(out_device_ref_buf.GetDeviceBuffer()),
+            dims);
+
+        HIP_CHECK_ERROR(hipDeviceSynchronize());
+
+        std::cout << "GPU reference kernel completed successfully, copying results..."
+                  << std::endl;
+
+        // Copy GPU reference result to host
+        out_device_ref_buf.FromDevice(out_host.mData.data());
+
+        // Copy GPU kernel result to host
+        out_device_buf.FromDevice(out_device.mData.data());
+
+        std::cout << "Comparing GPU kernel output vs GPU reference..." << std::endl;
+
+        // Compare GPU kernel vs GPU reference
+        bool pass = ck::utils::check_err(out_device,
+                                         out_host,
+                                         "Error: incorrect results!",
+                                         get_rtol<OutDataType>(),
+                                         get_atol<OutDataType>());
+
+        std::cout << "GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
+
+        return pass;
+    }
 
     return true;
 }
diff --git a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
index cfaacab64b..1dc71a16ef 100644
--- a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
@@ -25,7 +25,7 @@ using ::ck::Tensor;
 
 void print_helper_msg()
 {
-    std::cout << "arg1: verification (0=no, 1=yes)\n"
+    std::cout << "arg1: verification (0=no, 1=CPU)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
              << "arg3: time kernel (0=no, 1=yes)\n"
              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
@@ -162,6 +162,7 @@ bool run_grouped_conv_fwd_dl(bool do_verification,
 
     if(do_verification)
     {
+        // CPU verification only (DL variants are fused operations)
         auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
            NDimSpatial,
            InDataType,
diff --git a/example/09_convnd_fwd/run_convnd_fwd_example.inc b/example/09_convnd_fwd/run_convnd_fwd_example.inc
index 14c3d14ed9..4c394821a8 100644
--- a/example/09_convnd_fwd/run_convnd_fwd_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_example.inc
@@ -12,9 +12,9 @@ bool run_convnd_fwd_example(int argc, char* argv[])
 {
     print_helper_msg();
 
-    bool do_verification = true;
-    int init_method      = 1;
-    bool time_kernel     = false;
+    int do_verification = 1; // 0=no, 1=CPU, 2=GPU
+    int init_method     = 1;
+    bool time_kernel    = false;
 
     ck::utils::conv::ConvParam conv_param{
         2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
index 80299fd3d7..f278107c9e 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
@@ -17,14 +17,58 @@
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
+#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
+#include "ck_tile/host/hip_check_error.hpp"
 
 using ::ck::DeviceMem;
 using ::ck::HostTensorDescriptor;
 using ::ck::Tensor;
 
+template <typename DataType, typename AccDataType = DataType>
+inline __host__ __device__ constexpr double get_rtol()
+{
+    if constexpr(std::is_same_v<DataType, ck::half_t> && std::is_same_v<AccDataType, float>)
+        return 5e-3;
+    else if constexpr(std::is_same_v<DataType, float>)
+        return 1e-3;
+    else if constexpr(std::is_same_v<DataType, double>)
+        return 1e-6;
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+        return 1e-3;
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+        return 5e-2;
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+        return 1e-1;
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+        return 1.5e-1;
+    else
+        return 1e-3;
+}
+
+template <typename DataType, typename AccDataType = DataType>
+inline __host__ __device__ constexpr double get_atol()
+{
+    if constexpr(std::is_same_v<DataType, ck::half_t> && std::is_same_v<AccDataType, float>)
+        return 1e-3;
+    else if constexpr(std::is_same_v<DataType, float>)
+        return 1e-3;
+    else if constexpr(std::is_same_v<DataType, double>)
+        return 1e-6;
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+        return 1e-3;
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+        return 5e-2;
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+        return 16.1;
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+        return 16.1;
+    else
+        return 1e-3;
+}
+
 void print_helper_msg()
 {
-    std::cout << "arg1: verification (0=no, 1=yes)\n"
+    std::cout << "arg1: verification (0=no, 1=CPU, 2=GPU)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
              << "arg3: time kernel (0=no, 1=yes)\n"
              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
@@ -38,7 +82,7 @@ template
-int run_conv_bwd_data(bool do_verification,
+int run_conv_bwd_data(int do_verification,
                      int init_method,
                      bool time_kernel,
                      const ck::utils::conv::ConvParam& conv_param,
@@ -128,26 +172,30 @@ int run_conv_bwd_data(bool do_verification,
                                       wei_element_op,
                                       out_element_op);
 
+    // Check if optimized kernel supports these parameters
     if(!conv.IsSupportedArgument(argument.get()))
     {
        std::cout << "Not supported, please check parameters or device";
        return 0;
     }
 
+    // Run optimized kernel
     float ave_time = invoker.Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop      = conv_param.GetFlops();
     std::size_t num_btype = conv_param.GetByte();
 
-    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_btype / 1.E6 / ave_time;
 
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;
 
-    if(do_verification)
+    std::cout << "do_verification = " << do_verification << std::endl;
+
+    if(do_verification == 1)
     {
+        // CPU verification
         auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
                                                                          InDataType,
                                                                          WeiDataType,
                                                                          OutDataType,
                                                                          InElementOp,
                                                                          WeiElementOp,
                                                                          OutElementOp>();
 
         auto ref_invoker  = ref_conv.MakeInvoker();
         auto ref_argument = ref_conv.MakeArgument(in_host,
                                                   wei,
                                                   out,
                                                   conv_param.conv_filter_strides_,
                                                   conv_param.conv_filter_dilations_,
                                                   conv_param.input_left_pads_,
                                                   conv_param.input_right_pads_,
                                                   in_element_op,
                                                   wei_element_op,
                                                   out_element_op);
 
         ref_invoker.Run(ref_argument);
 
         in_device_buf.FromDevice(in_device.mData.data());
 
         return ck::utils::check_err(in_device, in_host) ? 0 : 1;
     }
+    else if(do_verification == 2)
+    {
+        // GPU verification using naive GPU reference
+        std::cout << "Running GPU verification..." << std::endl;
+
+        // Allocate and zero GPU memory for the reference input gradient
+        DeviceMem in_device_ref_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
+        in_device_ref_buf.SetZero();
+
+        // Extract dimensions using helper function
+        ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
+
+        // Launch GPU reference kernel: one thread per input element
+        constexpr ck::index_t block_size    = 256;
+        const ck::long_index_t input_length = dims.N * dims.Di * dims.Hi * dims.Wi * dims.C;
+        const ck::index_t grid_size         = (input_length + block_size - 1) / block_size;
+
+        auto gpu_ref_kernel =
+            ck::ref::naive_conv_bwd_data_ndhwc_kzyxc_ndhwk<InDataType, WeiDataType, OutDataType>;
+
+        gpu_ref_kernel<<<grid_size, block_size>>>(
+            reinterpret_cast<InDataType*>(in_device_ref_buf.GetDeviceBuffer()),
+            reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+            reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            dims);
+
+        HIP_CHECK_ERROR(hipDeviceSynchronize());
+
+        std::cout << "GPU reference kernel completed, copying results..." << std::endl;
+
+        // Copy GPU reference result
+        Tensor<InDataType> in_gpu_ref(in_host.mDesc);
+        in_device_ref_buf.FromDevice(in_gpu_ref.mData.data());
+
+        // Copy optimized kernel result
+        in_device_buf.FromDevice(in_device.mData.data());
+
+        // Compare: Optimized kernel result vs GPU reference result
+        bool pass = ck::utils::check_err(in_device,
+                                         in_gpu_ref,
+                                         "Error: Incorrect results!",
+                                         get_rtol<InDataType>(),
+                                         get_atol<InDataType>());
+        std::cout << "GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
+
+        return pass ? 0 : 1;
+    }
 
     return 0;
 }
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
index 30a6e1eedc..3d1c70bea0 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
@@ -63,9 +63,9 @@ int main(int argc, char* argv[])
 
     print_helper_msg();
 
-    bool do_verification = true;
-    int init_method      = 1;
-    bool time_kernel     = false;
+    int do_verification = 1; // 0=no, 1=CPU, 2=GPU
+    int init_method     = 1;
+    bool time_kernel    = false;
 
     ck::utils::conv::ConvParam conv_param{
         2, 1, 128, 256, 256, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
diff --git a/example/20_grouped_conv_bwd_weight/common.hpp b/example/20_grouped_conv_bwd_weight/common.hpp
index 6cf9facbb4..e34cb9a071 100644
--- a/example/20_grouped_conv_bwd_weight/common.hpp
+++ b/example/20_grouped_conv_bwd_weight/common.hpp
@@ -19,6 +19,7 @@
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
+#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
 
 using ::ck::DeviceMem;
 using ::ck::HostTensorDescriptor;
@@ -38,6 +39,48 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvBwdWeightDefault =
     ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
 
+template <typename DataType, typename AccDataType = DataType>
+inline __host__ __device__ constexpr double get_rtol()
+{
+    if constexpr(std::is_same_v<DataType, ck::half_t> && std::is_same_v<AccDataType, float>)
+        return 5e-3;
+    else if constexpr(std::is_same_v<DataType, float>)
+        return 1e-3;
+    else if constexpr(std::is_same_v<DataType, double>)
+        return 1e-6;
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+        return 1e-3;
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+        return 5e-2;
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+        return 1e-1;
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+        return 1.5e-1;
+    else
+        return 1e-3;
+}
+
+template <typename DataType, typename AccDataType = DataType>
+inline __host__ __device__ constexpr double get_atol()
+{
+    if constexpr(std::is_same_v<DataType, ck::half_t> && std::is_same_v<AccDataType, float>)
+        return 1e-3;
+    else if constexpr(std::is_same_v<DataType, float>)
+        return 1e-3;
+    else if constexpr(std::is_same_v<DataType, double>)
+        return 1e-6;
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+        return 1e-3;
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+        return 5e-2;
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+        return 16.1;
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+        return 16.1;
+    else
+        return 1e-3;
+}
+
 template <ck::index_t NDimSpatial>
 struct CommonLayoutSetting
 {
@@ -75,9 +118,9 @@ using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLa
 
 struct ExecutionConfig final
 {
-    bool do_verification = true;
-    int init_method      = 1;
-    bool time_kernel     = false;
+    int do_verification = 1; // 0=no, 1=CPU, 2=GPU
+    int init_method     = 1;
+    bool time_kernel    = false;
 };
 
 #define DefaultConvParam \
diff --git a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
index cc763b5b0f..bc1a5edac6 100644
--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
@@ -106,8 +106,11 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
 
     invoker.Run(argument, StreamConfig{nullptr, false});
 
-    if(config.do_verification)
+    std::cout << "do_verification = " << config.do_verification << std::endl;
+
+    if(config.do_verification == 1)
     {
+        // CPU verification
         auto ref_conv    = HostConvBwdWeightInstance{};
         auto ref_invoker = ref_conv.MakeInvoker();
         auto ref_argument = ref_conv.MakeArgument(in,
@@ -130,6 +133,61 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
 
         return ck::utils::check_err(wei_device_result.mData, wei_host_result.mData);
     }
+    else if(config.do_verification == 2)
+    {
+        // GPU verification (only supports G=1, standard convolution)
+        if(conv_param.G_ != 1)
+        {
+            std::cout << "GPU verification only supports G=1 (standard convolution)" << std::endl;
+            std::cout << "Current G=" << conv_param.G_ << " not supported." << std::endl;
+            std::cout << "Use do_verification=1 for CPU verification with grouped convolution."
+                      << std::endl;
+            return true;
+        }
+
+        std::cout << "Running GPU verification (G=1)..." << std::endl;
+
+        DeviceMem wei_device_ref_buf(sizeof(WeiDataType) *
+                                     wei_device_result.mDesc.GetElementSpaceSize());
+        wei_device_ref_buf.SetZero();
+
+        // Extract dimensions using helper function (G=1, standard convolution)
+        ck::ref::ConvDims dims =
+            ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial, false);
+
+        constexpr ck::index_t block_size     = 256;
+        const ck::long_index_t weight_length = dims.K * dims.Z * dims.Y * dims.X * dims.C;
+        const ck::index_t grid_size          = (weight_length + block_size - 1) / block_size;
+
+        auto gpu_ref_kernel =
+            ck::ref::naive_conv_bwd_weight_ndhwc_kzyxc_ndhwk<InDataType, WeiDataType, OutDataType>;
+
+        gpu_ref_kernel<<<grid_size, block_size>>>(
+            reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
+            reinterpret_cast<WeiDataType*>(wei_device_ref_buf.GetDeviceBuffer()),
+            reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            dims);
+
+        HIP_CHECK_ERROR(hipDeviceSynchronize());
+
+        std::cout << "GPU reference kernel completed, copying results..." << std::endl;
+
+        wei_device_ref_buf.FromDevice(wei_host_result.mData.data());
+        wei_device_buf.FromDevice(wei_device_result.mData.data());
+
+        bool pass = ck::utils::check_err(wei_device_result.mData,
+                                         wei_host_result.mData,
+                                         "Error: Incorrect results!",
+                                         get_rtol<WeiDataType>(),
+                                         get_atol<WeiDataType>());
+        std::cout << "GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
+
+        return pass;
+    }
 
     float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
index 7392074869..2772d20854 100644
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
@@ -1,6 +1,9 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
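[Note: the get_rtol/get_atol helpers duplicated into examples 17 and 20 above feed ck::utils::check_err directly; a minimal usage sketch follows. The <DataType, AccDataType> parameterization matches the reconstruction above and is an assumption.]

// Illustrative: verifying an fp16 result that was accumulated in fp32.
bool verify_fp16(const Tensor<ck::half_t>& result, const Tensor<ck::half_t>& reference)
{
    return ck::utils::check_err(result.mData,
                                reference.mData,
                                "Error: incorrect results!",
                                get_rtol<ck::half_t, float>(), // hits the 5e-3 branch
                                get_atol<ck::half_t, float>());
}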
// SPDX-License-Identifier: MIT #pragma once + +#include "ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp" + template ( + reinterpret_cast(input_ref_dev_buf.GetDeviceBuffer()), + reinterpret_cast(weight_dev_buf.GetDeviceBuffer()), + reinterpret_cast(output_dev_buf.GetDeviceBuffer()), + conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.C_, + conv_param.input_spatial_lengths_, + conv_param.filter_spatial_lengths_, + conv_param.output_spatial_lengths_, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_); + + // Copy GPU reference result to host for comparison + ck_tile::HostTensor input_gpu_ref(in_g_n_c_wis_desc); + input_ref_dev_buf.FromDevice(input_gpu_ref.data()); + + const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_); + const float max_accumulated_value = + *std::max_element(input_gpu_ref.mData.begin(), input_gpu_ref.mData.end()); + const auto rtol_atol = + calculate_rtol_atol( + GemmK, kbatch, max_accumulated_value); + pass = ck_tile::check_err(input, + input_gpu_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The GPU verification result is:" << (pass ? "correct" : "fail") << std::endl; } return pass; diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc index 528c82238c..2496a1b0d2 100644 --- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc +++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc @@ -1,6 +1,9 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT #pragma once + +#include "ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp" + template ( + reinterpret_cast(input_dev_buf.GetDeviceBuffer()), + reinterpret_cast(weight_ref_dev_buf.GetDeviceBuffer()), + reinterpret_cast(output_dev_buf.GetDeviceBuffer()), + conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.C_, + conv_param.input_spatial_lengths_, + conv_param.filter_spatial_lengths_, + conv_param.output_spatial_lengths_, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_); + + // Copy GPU reference result to host for comparison + ck_tile::HostTensor weight_gpu_ref(wei_g_k_c_xs_desc); + weight_ref_dev_buf.FromDevice(weight_gpu_ref.data()); + + ck_tile::index_t GemmK = conv_param.N_; + for(ck_tile::index_t i = 0; i < NDimSpatial; ++i) + { + GemmK *= conv_param.output_spatial_lengths_[i]; + } + const float max_accumulated_value = + *std::max_element(weight_gpu_ref.mData.begin(), weight_gpu_ref.mData.end()); + const auto rtol_atol = + calculate_rtol_atol( + GemmK, kbatch, max_accumulated_value); + pass = ck_tile::check_err(weight, + weight_gpu_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The GPU verification result is:" << (pass ? 
"correct" : "fail") << std::endl; } return pass; diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_bias_clamp_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_bias_clamp_example.inc index 7aaa1f049c..ccbbddd3b3 100644 --- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_bias_clamp_example.inc +++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_bias_clamp_example.inc @@ -230,7 +230,11 @@ int run_grouped_conv_fwd_bias_clamp_example_with_layouts( } else if(arg_parser.get_int("v") == 2) { - throw std::runtime_error("Unsupported gpu verification !!!"); + // GPU verification for fused operation (Conv + Bias + Clamp) is complex + // For now, we only support GPU verification for basic convolution operations + // The bias+clamp fused variant can use CPU verification (-v=1) or no verification (-v=0) + throw std::runtime_error("GPU verification not yet supported for fused operations! Use " + "-v=1 for CPU verification."); } return pass; diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc index 26dada8e84..05a1a9bb62 100644 --- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc +++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc @@ -3,6 +3,8 @@ #pragma once +#include "ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp" + template ( + reinterpret_cast(input_dev_buf.GetDeviceBuffer()), + reinterpret_cast(weight_dev_buf.GetDeviceBuffer()), + reinterpret_cast(output_ref_dev_buf.GetDeviceBuffer()), + conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.C_, + conv_param.input_spatial_lengths_, + conv_param.filter_spatial_lengths_, + conv_param.output_spatial_lengths_, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_); + + // Copy GPU reference result to host for comparison + ck_tile::HostTensor output_gpu_ref(out_g_n_k_wos_desc); + output_ref_dev_buf.FromDevice(output_gpu_ref.data()); + + const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_); + const float max_accumulated_value = + *std::max_element(output_gpu_ref.mData.begin(), output_gpu_ref.mData.end()); + const auto rtol_atol = + calculate_rtol_atol( + GemmK, kbatch, max_accumulated_value); + pass = ck_tile::check_err(output, + output_gpu_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The GPU verification result is:" << (pass ? "correct" : "fail") << std::endl; } return pass; diff --git a/example/test_old_ck_gpu_reference.cpp b/example/test_old_ck_gpu_reference.cpp new file mode 100644 index 0000000000..0bcf43d20b --- /dev/null +++ b/example/test_old_ck_gpu_reference.cpp @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +// Standalone test program for Old CK GPU references +// Tests naive_conv_fwd (existing) and future backward ops + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +// CPU reference for validation +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +// GPU reference (OLD CK - already exists!) +#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" + +using namespace ck; + +template +struct ConvParams +{ + index_t N, K, C; + std::vector input_spatial; + std::vector filter_spatial; + std::vector output_spatial; + std::vector strides; + std::vector dilations; + std::vector pads; +}; + +template +bool test_conv_forward_gpu_ref(const ConvParams& params, const std::string& test_name) +{ + std::cout << "[TEST] " << test_name << std::endl; + + // Calculate dimensions + const index_t N = params.N; + const index_t K = params.K; + const index_t C = params.C; + + // Create tensor descriptors (NDHWC layout for old CK) + std::vector in_lengths = {N}; + for(auto d : params.input_spatial) + in_lengths.push_back(d); + in_lengths.push_back(C); + + std::vector wei_lengths = {K}; + for(auto d : params.filter_spatial) + wei_lengths.push_back(d); + wei_lengths.push_back(C); + + std::vector out_lengths = {N}; + for(auto d : params.output_spatial) + out_lengths.push_back(d); + out_lengths.push_back(K); + + // Create host tensors + Tensor input(in_lengths); + Tensor weight(wei_lengths); + Tensor output_gpu(out_lengths); + Tensor output_ref(out_lengths); + + // Initialize with random data + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weight.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + + // Allocate device memory + DeviceMem input_dev(input.mData.size() * sizeof(InDataType)); + DeviceMem weight_dev(weight.mData.size() * sizeof(WeiDataType)); + DeviceMem output_dev(output_gpu.mData.size() * sizeof(OutDataType)); + + // Copy to device + input_dev.ToDevice(input.mData.data()); + weight_dev.ToDevice(weight.mData.data()); + + // Run CPU reference for validation + auto ref_conv = + tensor_operation::host::ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_arg = ref_conv.MakeArgument(input.mData.data(), + weight.mData.data(), + output_ref.mData.data(), + N, + K, + C, + params.input_spatial, + params.filter_spatial, + params.output_spatial, + params.strides, + params.dilations, + params.pads, + params.pads, + {}, + {}, + {}); + + ref_invoker.Run(ref_arg); + + // Run GPU reference (OLD CK) + using InElementOp = tensor_operation::element_wise::PassThrough; + using WeiElementOp = tensor_operation::element_wise::PassThrough; + using OutElementOp = tensor_operation::element_wise::PassThrough; + + constexpr index_t block_size = 256; + + // Extract dimensions based on NDimSpatial + index_t Di = 1, Hi = 1, Wi = 1; + index_t Z = 1, Y = 1, X = 1; + index_t Do = 1, Ho = 1, Wo = 1; + index_t stride_z = 1, stride_y = 1, stride_x = 1; + index_t dilation_z = 1, dilation_y = 1, dilation_x = 1; + index_t pad_z = 0, pad_y = 0, pad_x = 0; + + if(NDimSpatial == 1) + { + Wi = params.input_spatial[0]; + X = params.filter_spatial[0]; + Wo = params.output_spatial[0]; + stride_x = params.strides[0]; + dilation_x = params.dilations[0]; + pad_x = params.pads[0]; + } + else 
if(NDimSpatial == 2)
+    {
+        Hi         = params.input_spatial[0];
+        Wi         = params.input_spatial[1];
+        Y          = params.filter_spatial[0];
+        X          = params.filter_spatial[1];
+        Ho         = params.output_spatial[0];
+        Wo         = params.output_spatial[1];
+        stride_y   = params.strides[0];
+        stride_x   = params.strides[1];
+        dilation_y = params.dilations[0];
+        dilation_x = params.dilations[1];
+        pad_y      = params.pads[0];
+        pad_x      = params.pads[1];
+    }
+    else if(NDimSpatial == 3)
+    {
+        Di         = params.input_spatial[0];
+        Hi         = params.input_spatial[1];
+        Wi         = params.input_spatial[2];
+        Z          = params.filter_spatial[0];
+        Y          = params.filter_spatial[1];
+        X          = params.filter_spatial[2];
+        Do         = params.output_spatial[0];
+        Ho         = params.output_spatial[1];
+        Wo         = params.output_spatial[2];
+        stride_z   = params.strides[0];
+        stride_y   = params.strides[1];
+        stride_x   = params.strides[2];
+        dilation_z = params.dilations[0];
+        dilation_y = params.dilations[1];
+        dilation_x = params.dilations[2];
+        pad_z      = params.pads[0];
+        pad_y      = params.pads[1];
+        pad_x      = params.pads[2];
+    }
+
+    // Launch GPU reference kernel
+    const long_index_t output_length = N * Do * Ho * Wo * K;
+    const index_t grid_size          = (output_length + block_size - 1) / block_size;
+
+    hipLaunchKernelGGL(
+        (ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType, WeiDataType, OutDataType>),
+        dim3(grid_size),
+        dim3(block_size),
+        0,
+        nullptr,
+        reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
+        reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
+        reinterpret_cast<OutDataType*>(output_dev.GetDeviceBuffer()),
+        N, K, C,
+        Di, Hi, Wi,
+        Z, Y, X,
+        Do, Ho, Wo,
+        stride_z, stride_y, stride_x,
+        dilation_z, dilation_y, dilation_x,
+        pad_z, pad_y, pad_x);
+
+    hipDeviceSynchronize();
+
+    // Copy result back
+    output_dev.FromDevice(output_gpu.mData.data());
+
+    // Compare GPU ref vs CPU ref
+    bool pass = check_err(output_gpu.mData, output_ref.mData, "GPU vs CPU ref", 1e-3, 1e-3);
+
+    std::cout << "  Result: " << (pass ?
"✅ PASS" : "❌ FAIL") << std::endl; + + return pass; +} + +int main(int argc, char* argv[]) +{ + std::cout << "========================================" << std::endl; + std::cout << "Old CK GPU Reference Test Program" << std::endl; + std::cout << "========================================" << std::endl; + std::cout << std::endl; + + int passed = 0; + int failed = 0; + + // Test 1: 2D Conv, FP16, Small + { + ConvParams<2> params; + params.N = 2; + params.K = 8; + params.C = 8; + params.input_spatial = {7, 7}; + params.filter_spatial = {3, 3}; + params.output_spatial = {5, 5}; + params.strides = {1, 1}; + params.dilations = {1, 1}; + params.pads = {0, 0}; + + if(test_conv_forward_gpu_ref<2, half_t, half_t, half_t>(params, "2D-FP16-Small")) + passed++; + else + failed++; + } + + // Test 2: 2D Conv, FP32, Medium + { + ConvParams<2> params; + params.N = 4; + params.K = 16; + params.C = 16; + params.input_spatial = {14, 14}; + params.filter_spatial = {3, 3}; + params.output_spatial = {12, 12}; + params.strides = {1, 1}; + params.dilations = {1, 1}; + params.pads = {0, 0}; + + if(test_conv_forward_gpu_ref<2, float, float, float>(params, "2D-FP32-Medium")) + passed++; + else + failed++; + } + + // Test 3: 1D Conv, FP16 + { + ConvParams<1> params; + params.N = 2; + params.K = 8; + params.C = 8; + params.input_spatial = {16}; + params.filter_spatial = {3}; + params.output_spatial = {14}; + params.strides = {1}; + params.dilations = {1}; + params.pads = {0}; + + if(test_conv_forward_gpu_ref<1, half_t, half_t, half_t>(params, "1D-FP16")) + passed++; + else + failed++; + } + + // Test 4: 3D Conv, FP16, Small + { + ConvParams<3> params; + params.N = 1; + params.K = 8; + params.C = 8; + params.input_spatial = {5, 5, 5}; + params.filter_spatial = {3, 3, 3}; + params.output_spatial = {3, 3, 3}; + params.strides = {1, 1, 1}; + params.dilations = {1, 1, 1}; + params.pads = {0, 0, 0}; + + if(test_conv_forward_gpu_ref<3, half_t, half_t, half_t>(params, "3D-FP16-Small")) + passed++; + else + failed++; + } + + // Test 5: 2D Conv with stride + { + ConvParams<2> params; + params.N = 2; + params.K = 8; + params.C = 8; + params.input_spatial = {8, 8}; + params.filter_spatial = {3, 3}; + params.output_spatial = {3, 3}; + params.strides = {2, 2}; + params.dilations = {1, 1}; + params.pads = {0, 0}; + + if(test_conv_forward_gpu_ref<2, half_t, half_t, half_t>(params, "2D-FP16-Stride2")) + passed++; + else + failed++; + } + + std::cout << std::endl; + std::cout << "========================================" << std::endl; + std::cout << "SUMMARY" << std::endl; + std::cout << "========================================" << std::endl; + std::cout << "Total: " << (passed + failed) << std::endl; + std::cout << "Passed: " << passed << " ✅" << std::endl; + std::cout << "Failed: " << failed << std::endl; + std::cout << std::endl; + + if(failed == 0) + { + std::cout << "🎉 ALL TESTS PASSED!" 
<< std::endl; + std::cout << "Old CK Forward GPU Reference: WORKING ✅" << std::endl; + return 0; + } + else + { + std::cout << "❌ SOME TESTS FAILED" << std::endl; + return 1; + } +} diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp index 0388da57d9..2cdb70e2a2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp @@ -11,7 +11,7 @@ #include "device.hpp" #include "device_conv_fwd.hpp" #include "common_header.hpp" -#include "naive_conv_fwd.hpp" +#include "naive_conv_fwd_gpu.hpp" namespace ck { namespace tensor_operation { diff --git a/include/ck_tile/ref/conv_common.hpp b/include/ck_tile/ref/conv_common.hpp new file mode 100644 index 0000000000..ed43e87b14 --- /dev/null +++ b/include/ck_tile/ref/conv_common.hpp @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include +#include + +namespace ck_tile { + +// Helper function to convert std::vector to std::array for kernel parameters +template +inline std::array +to_array(const std::vector& vec) +{ + std::array arr; + for(ck_tile::index_t i = 0; i < NDimSpatial; ++i) + { + arr[i] = vec[i]; + } + return arr; +} + +// Helper to fill missing dimensions with default value +template +inline std::array +to_array_with_default(const std::vector& vec, + ck_tile::long_index_t default_val = 1) +{ + std::array arr; + for(ck_tile::index_t i = 0; i < NDimSpatial; ++i) + { + arr[i] = (static_cast(i) < vec.size()) ? 
vec[i] : default_val;
+    }
+    return arr;
+}
+
+// Index calculation helpers for GPU reference kernels
+namespace detail {
+
+// Calculate linear input index for grouped convolution
+// Layout: [N, spatial..., G, C]
+template <index_t NDimSpatial>
+inline __device__ long_index_t
+calculate_input_index(index_t n,
+                      index_t g,
+                      index_t c,
+                      const std::array<index_t, NDimSpatial>& spatial_idx,
+                      const std::array<long_index_t, NDimSpatial + 3>& strides)
+{
+    long_index_t idx = n * strides[0];
+    for(index_t i = 0; i < NDimSpatial; ++i)
+        idx += spatial_idx[i] * strides[i + 1];
+    idx += g * strides[NDimSpatial + 1] + c;
+    return idx;
+}
+
+// Calculate linear weight index for grouped convolution
+// Layout: [G, K, spatial..., C]
+template <index_t NDimSpatial>
+inline __device__ long_index_t
+calculate_weight_index(index_t g,
+                       index_t k,
+                       index_t c,
+                       const std::array<index_t, NDimSpatial>& spatial_idx,
+                       const std::array<long_index_t, NDimSpatial + 3>& strides)
+{
+    long_index_t idx = g * strides[0] + k * strides[1];
+    for(index_t i = 0; i < NDimSpatial; ++i)
+        idx += spatial_idx[i] * strides[i + 2];
+    idx += c * strides[NDimSpatial + 2];
+    return idx;
+}
+
+// Calculate linear output index for grouped convolution
+// Layout: [N, spatial..., G, K]
+template <index_t NDimSpatial>
+inline __device__ long_index_t
+calculate_output_index(index_t n,
+                       index_t g,
+                       index_t k,
+                       const std::array<index_t, NDimSpatial>& spatial_idx,
+                       const std::array<long_index_t, NDimSpatial + 3>& strides)
+{
+    long_index_t idx = n * strides[0];
+    for(index_t i = 0; i < NDimSpatial; ++i)
+        idx += spatial_idx[i] * strides[i + 1];
+    idx += g * strides[NDimSpatial + 1] + k;
+    return idx;
+}
+
+} // namespace detail
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp b/include/ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp
new file mode 100644
index 0000000000..a5f6a697f2
--- /dev/null
+++ b/include/ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp
@@ -0,0 +1,360 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
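[Note: a worked example to make the stride conventions of the helpers above concrete; the numbers are illustrative.]

// NHWGC input, N=2, Hi=4, Wi=4, G=3, C=8 (2D case). Innermost-to-outermost:
//   C stride = 1, G stride = C = 8, W stride = G*C = 24,
//   H stride = Wi*24 = 96,  N stride = Hi*96 = 384.
// With strides = {384, 96, 24, 8, 1} ordered {N, H, W, G, C}:
//   calculate_input_index<2>(n=1, g=1, c=5, {h=2, w=3}, strides)
//     = 1*384 + 2*96 + 3*24 + 1*8 + 5 = 661.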
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ref/conv_common.hpp" +#include +#include "ck_tile/host/device_memory.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include + +namespace ck_tile { + +// Naive GPU reference kernel struct for backward data grouped convolution +// Computes gradient with respect to input +// Layout: Input_grad=NDHWGC, Weight=GKZYXC, Output_grad=NDHWGK (for 3D case) +// Input_grad=NHWGC, Weight=GKYXC, Output_grad=NHWGK (for 2D case) +// Input_grad=NWGC, Weight=GKXC, Output_grad=NWGK (for 1D case) +// +// One thread per input element, uses grid-stride loop pattern + +template +struct naive_grouped_conv_bwd_data_kernel +{ + static constexpr ck_tile::index_t kBlockSize = 256; + + __device__ void + operator()(InDataType* __restrict__ p_in_grad, + const WeiDataType* __restrict__ p_wei, + const OutDataType* __restrict__ p_out_grad, + // Tensor dimensions + ck_tile::index_t G, // number of groups + ck_tile::index_t N, // batch size + ck_tile::index_t K, // output channels per group + ck_tile::index_t C, // input channels per group + // Input spatial dimensions + const std::array& in_spatial_lengths, + // Weight spatial dimensions + const std::array& wei_spatial_lengths, + // Output spatial dimensions + const std::array& out_spatial_lengths, + // Convolution parameters + const std::array& conv_strides, + const std::array& conv_dilations, + const std::array& in_left_pads) const + { + const ck_tile::long_index_t tid = get_block_id() * blockDim.x + get_thread_id(); + const ck_tile::long_index_t num_threads = blockDim.x * gridDim.x; + + // Calculate total input elements + ck_tile::long_index_t input_length = G * N * C; + for(ck_tile::index_t i = 0; i < NDimSpatial; ++i) + { + input_length *= in_spatial_lengths[i]; + } + + // Calculate strides for input tensor (NDHWGC or NHWGC or NWGC) + std::array in_strides; + ck_tile::long_index_t stride = 1; + in_strides[NDimSpatial + 2] = stride; // C stride + stride *= C; + in_strides[NDimSpatial + 1] = stride; // G stride + stride *= G; + for(ck_tile::index_t i = NDimSpatial - 1; i >= 0; --i) + { + in_strides[i + 1] = stride; + stride *= in_spatial_lengths[i]; + } + in_strides[0] = stride; // N stride + + // Calculate strides for output tensor (NDHWGK or NHWGK or NWGK) + std::array out_strides; + stride = 1; + out_strides[NDimSpatial + 2] = stride; // K stride + stride *= K; + out_strides[NDimSpatial + 1] = stride; // G stride + stride *= G; + for(ck_tile::index_t i = NDimSpatial - 1; i >= 0; --i) + { + out_strides[i + 1] = stride; + stride *= out_spatial_lengths[i]; + } + out_strides[0] = stride; // N stride + + // Calculate strides for weight tensor (GKZYXC or GKYXC or GKXC) + std::array wei_strides; + stride = 1; + wei_strides[NDimSpatial + 2] = stride; // C stride + stride *= C; + for(ck_tile::index_t i = NDimSpatial - 1; i >= 0; --i) + { + wei_strides[i + 2] = stride; + stride *= wei_spatial_lengths[i]; + } + wei_strides[1] = stride; // K stride + stride *= K; + wei_strides[0] = stride; // G stride + + // Grid-stride loop over all input elements + for(ck_tile::long_index_t ii = tid; ii < input_length; ii += num_threads) + { + // Decode linear index to multi-dimensional indices + ck_tile::long_index_t tmp = ii; + + // Extract N (batch) + ck_tile::index_t n = tmp / in_strides[0]; + tmp -= n * in_strides[0]; + + // Extract spatial dimensions + ck_tile::index_t in_spatial_idx[6]; + for(ck_tile::index_t i = 0; i < NDimSpatial; ++i) + { + in_spatial_idx[i] = tmp / in_strides[i + 1]; + tmp -= in_spatial_idx[i] * 
in_strides[i + 1]; + } + + // Extract G (group) + ck_tile::index_t g = tmp / in_strides[NDimSpatial + 1]; + tmp -= g * in_strides[NDimSpatial + 1]; + + // Extract C (input channel) + ck_tile::index_t c = tmp; + + // Accumulate in float + float v_acc = 0.0f; + + // Loop over output channels + for(ck_tile::index_t k = 0; k < K; ++k) + { + // Loop over filter spatial dimensions + if constexpr(NDimSpatial == 1) + { + for(ck_tile::index_t x = 0; x < wei_spatial_lengths[0]; ++x) + { + // Calculate output spatial coordinate (inverse of forward) + ck_tile::long_index_t w_tmp = + static_cast(in_spatial_idx[0]) + + static_cast(in_left_pads[0]) - + static_cast(x * conv_dilations[0]); + + // Check if this maps to valid output position + if(w_tmp % conv_strides[0] == 0) + { + ck_tile::long_index_t wo = w_tmp / conv_strides[0]; + + if(wo >= 0 && wo < out_spatial_lengths[0]) + { + std::array out_spatial = { + static_cast(wo)}; + std::array wei_spatial = {x}; + ck_tile::long_index_t out_idx = detail::calculate_output_index<1>( + n, g, k, out_spatial, out_strides); + ck_tile::long_index_t wei_idx = detail::calculate_weight_index<1>( + g, k, c, wei_spatial, wei_strides); + + v_acc += type_convert(p_out_grad[out_idx]) * + type_convert(p_wei[wei_idx]); + } + } + } + } + else if constexpr(NDimSpatial == 2) + { + for(ck_tile::index_t y = 0; y < wei_spatial_lengths[0]; ++y) + { + ck_tile::long_index_t h_tmp = + static_cast(in_spatial_idx[0]) + + static_cast(in_left_pads[0]) - + static_cast(y * conv_dilations[0]); + + if(h_tmp % conv_strides[0] == 0) + { + ck_tile::long_index_t ho = h_tmp / conv_strides[0]; + + if(ho >= 0 && ho < out_spatial_lengths[0]) + { + for(ck_tile::index_t x = 0; x < wei_spatial_lengths[1]; ++x) + { + ck_tile::long_index_t w_tmp = + static_cast(in_spatial_idx[1]) + + static_cast(in_left_pads[1]) - + static_cast(x * conv_dilations[1]); + + if(w_tmp % conv_strides[1] == 0) + { + ck_tile::long_index_t wo = w_tmp / conv_strides[1]; + + if(wo >= 0 && wo < out_spatial_lengths[1]) + { + std::array out_spatial = { + static_cast(ho), static_cast(wo)}; + std::array wei_spatial = {y, x}; + ck_tile::long_index_t out_idx = + detail::calculate_output_index<2>( + n, g, k, out_spatial, out_strides); + ck_tile::long_index_t wei_idx = + detail::calculate_weight_index<2>( + g, k, c, wei_spatial, wei_strides); + + v_acc += type_convert(p_out_grad[out_idx]) * + type_convert(p_wei[wei_idx]); + } + } + } + } + } + } + } + else if constexpr(NDimSpatial == 3) + { + for(ck_tile::index_t z = 0; z < wei_spatial_lengths[0]; ++z) + { + ck_tile::long_index_t d_tmp = + static_cast(in_spatial_idx[0]) + + static_cast(in_left_pads[0]) - + static_cast(z * conv_dilations[0]); + + if(d_tmp % conv_strides[0] == 0) + { + ck_tile::long_index_t do_ = d_tmp / conv_strides[0]; + + if(do_ >= 0 && do_ < out_spatial_lengths[0]) + { + for(ck_tile::index_t y = 0; y < wei_spatial_lengths[1]; ++y) + { + ck_tile::long_index_t h_tmp = + static_cast(in_spatial_idx[1]) + + static_cast(in_left_pads[1]) - + static_cast(y * conv_dilations[1]); + + if(h_tmp % conv_strides[1] == 0) + { + ck_tile::long_index_t ho = h_tmp / conv_strides[1]; + + if(ho >= 0 && ho < out_spatial_lengths[1]) + { + for(ck_tile::index_t x = 0; x < wei_spatial_lengths[2]; + ++x) + { + ck_tile::long_index_t w_tmp = + static_cast( + in_spatial_idx[2]) + + static_cast( + in_left_pads[2]) - + static_cast( + x * conv_dilations[2]); + + if(w_tmp % conv_strides[2] == 0) + { + ck_tile::long_index_t wo = + w_tmp / conv_strides[2]; + + if(wo >= 0 && wo < out_spatial_lengths[2]) + { + 
std::array + out_spatial = { + static_cast(do_), + static_cast(ho), + static_cast(wo)}; + std::array + wei_spatial = {z, y, x}; + ck_tile::long_index_t out_idx = + detail::calculate_output_index<3>( + n, g, k, out_spatial, out_strides); + ck_tile::long_index_t wei_idx = + detail::calculate_weight_index<3>( + g, k, c, wei_spatial, wei_strides); + + v_acc += + type_convert( + p_out_grad[out_idx]) * + type_convert(p_wei[wei_idx]); + } + } + } + } + } + } + } + } + } + } + } + + // Convert accumulator to output type and write + p_in_grad[ii] = type_convert(v_acc); + } + } +}; + +// Host-side launcher for naive grouped convolution backward data +template +CK_TILE_HOST float +naive_grouped_conv_bwd_data(InDataType* p_in_grad_dev, + const WeiDataType* p_wei_dev, + const OutDataType* p_out_grad_dev, + ck_tile::index_t G, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t C, + std::vector in_spatial_lengths, + std::vector wei_spatial_lengths, + std::vector out_spatial_lengths, + std::vector conv_strides, + std::vector conv_dilations, + std::vector in_left_pads, + ck_tile::stream_config stream_config = {}) +{ + // Convert vectors to arrays + auto in_spatial_arr = to_array_with_default(in_spatial_lengths); + auto wei_spatial_arr = to_array_with_default(wei_spatial_lengths); + auto out_spatial_arr = to_array_with_default(out_spatial_lengths); + auto conv_strides_arr = to_array_with_default(conv_strides); + auto conv_dilations_arr = to_array_with_default(conv_dilations); + auto in_left_pads_arr = to_array_with_default(in_left_pads, 0); + + // Calculate grid size + ck_tile::long_index_t input_length = G * N * C; + for(ck_tile::index_t i = 0; i < NDimSpatial; ++i) + { + input_length *= in_spatial_lengths[i]; + } + + using KernelType = + naive_grouped_conv_bwd_data_kernel; + + constexpr ck_tile::index_t block_size = KernelType::kBlockSize; + const ck_tile::index_t grid_size = (input_length + block_size - 1) / block_size; + + // Launch kernel + float elapsed_ms = launch_kernel(stream_config, + make_kernel(KernelType{}, + dim3(grid_size), + dim3(block_size), + 0, // dynamic shared memory size + p_in_grad_dev, + p_wei_dev, + p_out_grad_dev, + G, + N, + K, + C, + in_spatial_arr, + wei_spatial_arr, + out_spatial_arr, + conv_strides_arr, + conv_dilations_arr, + in_left_pads_arr)); + + return elapsed_ms; +} + +} // namespace ck_tile diff --git a/include/ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp b/include/ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp new file mode 100644 index 0000000000..2ac9c19892 --- /dev/null +++ b/include/ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
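[Note: a call-site sketch for the backward-data launcher defined above. The template-parameter order <InDataType, WeiDataType, OutDataType, NDimSpatial> is an assumption (the original list was lost), and all buffer/dimension names are illustrative.]

// Hypothetical 2D call: gradient w.r.t. input for a grouped NHWGC problem.
float ms = ck_tile::naive_grouped_conv_bwd_data<ck_tile::half_t, ck_tile::half_t,
                                                ck_tile::half_t, 2>(
    static_cast<ck_tile::half_t*>(in_grad_buf.GetDeviceBuffer()),
    static_cast<const ck_tile::half_t*>(wei_buf.GetDeviceBuffer()),
    static_cast<const ck_tile::half_t*>(out_grad_buf.GetDeviceBuffer()),
    /*G=*/2, /*N=*/4, /*K=*/16, /*C=*/16,
    /*in_spatial=*/{14, 14}, /*wei_spatial=*/{3, 3}, /*out_spatial=*/{12, 12},
    /*strides=*/{1, 1}, /*dilations=*/{1, 1}, /*left_pads=*/{0, 0});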
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ref/conv_common.hpp" +#include +#include "ck_tile/host/device_memory.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include + +namespace ck_tile { + +// Naive GPU reference kernel struct for backward weight grouped convolution +// Computes gradient with respect to weights +// Layout: Input=NDHWGC, Weight_grad=GKZYXC, Output_grad=NDHWGK (for 3D case) +// Input=NHWGC, Weight_grad=GKYXC, Output_grad=NHWGK (for 2D case) +// Input=NWGC, Weight_grad=GKXC, Output_grad=NWGK (for 1D case) +// +// One thread per weight element, uses grid-stride loop pattern + +template +struct naive_grouped_conv_bwd_weight_kernel +{ + static constexpr ck_tile::index_t kBlockSize = 256; + + __device__ void + operator()(const InDataType* __restrict__ p_in, + WeiDataType* __restrict__ p_wei_grad, + const OutDataType* __restrict__ p_out_grad, + // Tensor dimensions + ck_tile::index_t G, // number of groups + ck_tile::index_t N, // batch size + ck_tile::index_t K, // output channels per group + ck_tile::index_t C, // input channels per group + // Input spatial dimensions + const std::array& in_spatial_lengths, + // Weight spatial dimensions + const std::array& wei_spatial_lengths, + // Output spatial dimensions + const std::array& out_spatial_lengths, + // Convolution parameters + const std::array& conv_strides, + const std::array& conv_dilations, + const std::array& in_left_pads) const + { + const ck_tile::long_index_t tid = get_block_id() * blockDim.x + get_thread_id(); + const ck_tile::long_index_t num_threads = blockDim.x * gridDim.x; + + // Calculate total weight elements + ck_tile::long_index_t weight_length = G * K * C; + for(ck_tile::index_t i = 0; i < NDimSpatial; ++i) + { + weight_length *= wei_spatial_lengths[i]; + } + + // Calculate strides for weight tensor (GKZYXC or GKYXC or GKXC) + std::array wei_strides; + ck_tile::long_index_t stride = 1; + wei_strides[NDimSpatial + 2] = stride; // C stride + stride *= C; + for(ck_tile::index_t i = NDimSpatial - 1; i >= 0; --i) + { + wei_strides[i + 2] = stride; + stride *= wei_spatial_lengths[i]; + } + wei_strides[1] = stride; // K stride + stride *= K; + wei_strides[0] = stride; // G stride + + // Calculate strides for input tensor (NDHWGC or NHWGC or NWGC) + std::array in_strides; + stride = 1; + in_strides[NDimSpatial + 2] = stride; // C stride + stride *= C; + in_strides[NDimSpatial + 1] = stride; // G stride + stride *= G; + for(ck_tile::index_t i = NDimSpatial - 1; i >= 0; --i) + { + in_strides[i + 1] = stride; + stride *= in_spatial_lengths[i]; + } + in_strides[0] = stride; // N stride + + // Calculate strides for output tensor (NDHWGK or NHWGK or NWGK) + std::array out_strides; + stride = 1; + out_strides[NDimSpatial + 2] = stride; // K stride + stride *= K; + out_strides[NDimSpatial + 1] = stride; // G stride + stride *= G; + for(ck_tile::index_t i = NDimSpatial - 1; i >= 0; --i) + { + out_strides[i + 1] = stride; + stride *= out_spatial_lengths[i]; + } + out_strides[0] = stride; // N stride + + // Grid-stride loop over all weight elements + for(ck_tile::long_index_t ii = tid; ii < weight_length; ii += num_threads) + { + // Decode linear index to multi-dimensional indices + ck_tile::long_index_t tmp = ii; + + // Extract G (group) + ck_tile::index_t g = tmp / wei_strides[0]; + tmp -= g * wei_strides[0]; + + // Extract K (output channel) + ck_tile::index_t k = tmp / wei_strides[1]; + tmp -= k * wei_strides[1]; + + // Extract spatial dimensions (come before C in GKZYXC layout) + ck_tile::index_t 
wei_spatial_idx[6]; + for(ck_tile::index_t i = 0; i < NDimSpatial; ++i) + { + wei_spatial_idx[i] = tmp / wei_strides[i + 2]; + tmp -= wei_spatial_idx[i] * wei_strides[i + 2]; + } + + // Extract C (input channel) - comes last + ck_tile::index_t c = tmp; + + // Accumulate in float + float v_acc = 0.0f; + + // Loop over batch + for(ck_tile::index_t n = 0; n < N; ++n) + { + // Loop over output spatial dimensions + if constexpr(NDimSpatial == 1) + { + for(ck_tile::index_t wo = 0; wo < out_spatial_lengths[0]; ++wo) + { + // Calculate input spatial coordinate + ck_tile::long_index_t wi = + static_cast(wo * conv_strides[0]) + + static_cast(wei_spatial_idx[0] * + conv_dilations[0]) - + static_cast(in_left_pads[0]); + + // Bounds check + if(wi >= 0 && wi < in_spatial_lengths[0]) + { + std::array in_spatial = {static_cast(wi)}; + std::array out_spatial = { + static_cast(wo)}; + ck_tile::long_index_t in_idx = + detail::calculate_input_index<1>(n, g, c, in_spatial, in_strides); + ck_tile::long_index_t out_idx = detail::calculate_output_index<1>( + n, g, k, out_spatial, out_strides); + + v_acc += type_convert(p_out_grad[out_idx]) * + type_convert(p_in[in_idx]); + } + } + } + else if constexpr(NDimSpatial == 2) + { + for(ck_tile::index_t ho = 0; ho < out_spatial_lengths[0]; ++ho) + { + ck_tile::long_index_t hi = + static_cast(ho * conv_strides[0]) + + static_cast(wei_spatial_idx[0] * + conv_dilations[0]) - + static_cast(in_left_pads[0]); + + for(ck_tile::index_t wo = 0; wo < out_spatial_lengths[1]; ++wo) + { + ck_tile::long_index_t wi = + static_cast(wo * conv_strides[1]) + + static_cast(wei_spatial_idx[1] * + conv_dilations[1]) - + static_cast(in_left_pads[1]); + + // Bounds check + if(hi >= 0 && hi < in_spatial_lengths[0] && wi >= 0 && + wi < in_spatial_lengths[1]) + { + std::array in_spatial = { + static_cast(hi), static_cast(wi)}; + std::array out_spatial = { + static_cast(ho), static_cast(wo)}; + ck_tile::long_index_t in_idx = detail::calculate_input_index<2>( + n, g, c, in_spatial, in_strides); + ck_tile::long_index_t out_idx = detail::calculate_output_index<2>( + n, g, k, out_spatial, out_strides); + + v_acc += type_convert(p_out_grad[out_idx]) * + type_convert(p_in[in_idx]); + } + } + } + } + else if constexpr(NDimSpatial == 3) + { + for(ck_tile::index_t do_ = 0; do_ < out_spatial_lengths[0]; ++do_) + { + ck_tile::long_index_t di = + static_cast(do_ * conv_strides[0]) + + static_cast(wei_spatial_idx[0] * + conv_dilations[0]) - + static_cast(in_left_pads[0]); + + for(ck_tile::index_t ho = 0; ho < out_spatial_lengths[1]; ++ho) + { + ck_tile::long_index_t hi = + static_cast(ho * conv_strides[1]) + + static_cast(wei_spatial_idx[1] * + conv_dilations[1]) - + static_cast(in_left_pads[1]); + + for(ck_tile::index_t wo = 0; wo < out_spatial_lengths[2]; ++wo) + { + ck_tile::long_index_t wi = + static_cast(wo * conv_strides[2]) + + static_cast(wei_spatial_idx[2] * + conv_dilations[2]) - + static_cast(in_left_pads[2]); + + // Bounds check + if(di >= 0 && di < in_spatial_lengths[0] && hi >= 0 && + hi < in_spatial_lengths[1] && wi >= 0 && + wi < in_spatial_lengths[2]) + { + std::array in_spatial = { + static_cast(di), + static_cast(hi), + static_cast(wi)}; + std::array out_spatial = { + static_cast(do_), + static_cast(ho), + static_cast(wo)}; + ck_tile::long_index_t in_idx = detail::calculate_input_index<3>( + n, g, c, in_spatial, in_strides); + ck_tile::long_index_t out_idx = + detail::calculate_output_index<3>( + n, g, k, out_spatial, out_strides); + + v_acc += type_convert(p_out_grad[out_idx]) * + 
type_convert(p_in[in_idx]); + } + } + } + } + } + } + + // Convert accumulator to output type and write + p_wei_grad[ii] = type_convert(v_acc); + } + } +}; + +// Host-side launcher for naive grouped convolution backward weight +template +CK_TILE_HOST float +naive_grouped_conv_bwd_weight(const InDataType* p_in_dev, + WeiDataType* p_wei_grad_dev, + const OutDataType* p_out_grad_dev, + ck_tile::index_t G, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t C, + std::vector in_spatial_lengths, + std::vector wei_spatial_lengths, + std::vector out_spatial_lengths, + std::vector conv_strides, + std::vector conv_dilations, + std::vector in_left_pads, + ck_tile::stream_config stream_config = {}) +{ + // Convert vectors to arrays + auto in_spatial_arr = to_array_with_default(in_spatial_lengths); + auto wei_spatial_arr = to_array_with_default(wei_spatial_lengths); + auto out_spatial_arr = to_array_with_default(out_spatial_lengths); + auto conv_strides_arr = to_array_with_default(conv_strides); + auto conv_dilations_arr = to_array_with_default(conv_dilations); + auto in_left_pads_arr = to_array_with_default(in_left_pads, 0); + + // Calculate grid size + ck_tile::long_index_t weight_length = G * K * C; + for(ck_tile::index_t i = 0; i < NDimSpatial; ++i) + { + weight_length *= wei_spatial_lengths[i]; + } + + using KernelType = + naive_grouped_conv_bwd_weight_kernel; + + constexpr ck_tile::index_t block_size = KernelType::kBlockSize; + const ck_tile::index_t grid_size = (weight_length + block_size - 1) / block_size; + + // Launch kernel + float elapsed_ms = launch_kernel(stream_config, + make_kernel(KernelType{}, + dim3(grid_size), + dim3(block_size), + 0, // dynamic shared memory size + p_in_dev, + p_wei_grad_dev, + p_out_grad_dev, + G, + N, + K, + C, + in_spatial_arr, + wei_spatial_arr, + out_spatial_arr, + conv_strides_arr, + conv_dilations_arr, + in_left_pads_arr)); + + return elapsed_ms; +} + +} // namespace ck_tile diff --git a/include/ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp b/include/ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp new file mode 100644 index 0000000000..720fa40297 --- /dev/null +++ b/include/ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
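[Note: one design point of the backward-weight kernel above, before the forward kernel below: each thread owns exactly one weight-gradient element and privately reduces over the batch and output window, so no atomics are needed. Schematically, for the 1D case:]

// Per-thread reduction for a fixed (g, k, x, c) weight element:
// for n  in [0, N):
//   for wo in [0, Wo):
//     wi = wo*stride_w + x*dilation_w - pad_w;
//     if (0 <= wi && wi < Wi)
//       acc += out_grad[n, wo, g, k] * in[n, wi, g, c];   // float accumulation
// wei_grad[g, k, x, c] = type_convert<WeiDataType>(acc);  // single write, no atomics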
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ref/conv_common.hpp" +#include +#include "ck_tile/host/device_memory.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include + +namespace ck_tile { + +// Naive GPU reference kernel struct for forward grouped convolution +// Layout: Input=NDHWGC, Weight=GKZYXC, Output=NDHWGK (for 3D case) +// Input=NHWGC, Weight=GKYXC, Output=NHWGK (for 2D case) +// Input=NWGC, Weight=GKXC, Output=NWGK (for 1D case) +// +// One thread per output element, uses grid-stride loop pattern + +template +struct naive_grouped_conv_fwd_kernel +{ + static constexpr ck_tile::index_t kBlockSize = 256; + + __device__ void + operator()(const InDataType* __restrict__ p_in, + const WeiDataType* __restrict__ p_wei, + OutDataType* __restrict__ p_out, + // Tensor dimensions + ck_tile::index_t G, // number of groups + ck_tile::index_t N, // batch size + ck_tile::index_t K, // output channels per group + ck_tile::index_t C, // input channels per group + // Input spatial dimensions + const std::array& in_spatial_lengths, + // Weight spatial dimensions + const std::array& wei_spatial_lengths, + // Output spatial dimensions + const std::array& out_spatial_lengths, + // Convolution parameters + const std::array& conv_strides, + const std::array& conv_dilations, + const std::array& in_left_pads) const + { + const ck_tile::long_index_t tid = get_block_id() * blockDim.x + get_thread_id(); + const ck_tile::long_index_t num_threads = blockDim.x * gridDim.x; + + // Calculate total output elements + ck_tile::long_index_t output_length = G * N * K; + for(ck_tile::index_t i = 0; i < NDimSpatial; ++i) + { + output_length *= out_spatial_lengths[i]; + } + + // Calculate strides for output tensor (NDHWGK or NHWGK or NWGK) + std::array out_strides; // N, spatial dims, G, K + ck_tile::long_index_t stride = 1; + out_strides[NDimSpatial + 2] = stride; // K stride + stride *= K; + out_strides[NDimSpatial + 1] = stride; // G stride + stride *= G; + for(ck_tile::index_t i = NDimSpatial - 1; i >= 0; --i) // Spatial strides (reversed) + { + out_strides[i + 1] = stride; + stride *= out_spatial_lengths[i]; + } + out_strides[0] = stride; // N stride + + // Calculate strides for input tensor (NDHWGC or NHWGC or NWGC) + std::array in_strides; + stride = 1; + in_strides[NDimSpatial + 2] = stride; // C stride + stride *= C; + in_strides[NDimSpatial + 1] = stride; // G stride + stride *= G; + for(ck_tile::index_t i = NDimSpatial - 1; i >= 0; --i) + { + in_strides[i + 1] = stride; + stride *= in_spatial_lengths[i]; + } + in_strides[0] = stride; // N stride + + // Calculate strides for weight tensor (GKZYXC or GKYXC or GKXC) + std::array wei_strides; + stride = 1; + wei_strides[NDimSpatial + 2] = stride; // C stride + stride *= C; + for(ck_tile::index_t i = NDimSpatial - 1; i >= 0; --i) + { + wei_strides[i + 2] = stride; + stride *= wei_spatial_lengths[i]; + } + wei_strides[1] = stride; // K stride + stride *= K; + wei_strides[0] = stride; // G stride + + // Grid-stride loop over all output elements + for(ck_tile::long_index_t ii = tid; ii < output_length; ii += num_threads) + { + // Decode linear index to multi-dimensional indices + ck_tile::long_index_t tmp = ii; + + // Extract N (batch) + ck_tile::index_t n = tmp / out_strides[0]; + tmp -= n * out_strides[0]; + + // Extract spatial dimensions (D, H, W) + ck_tile::index_t out_spatial_idx[6]; // Max 6 spatial dimensions + for(ck_tile::index_t i = 0; i < NDimSpatial; ++i) + { + out_spatial_idx[i] = tmp / out_strides[i + 1]; + tmp -= 
out_spatial_idx[i] * out_strides[i + 1]; + } + + // Extract G (group) + ck_tile::index_t g = tmp / out_strides[NDimSpatial + 1]; + tmp -= g * out_strides[NDimSpatial + 1]; + + // Extract K (output channel) + ck_tile::index_t k = tmp; + + // Accumulate in float + float v_acc = 0.0f; + + // Loop over input channels + for(ck_tile::index_t c = 0; c < C; ++c) + { + // Loop over filter spatial dimensions + if constexpr(NDimSpatial == 1) + { + for(ck_tile::index_t x = 0; x < wei_spatial_lengths[0]; ++x) + { + // Calculate input spatial coordinate + ck_tile::long_index_t wi = + static_cast(out_spatial_idx[0] * + conv_strides[0]) + + static_cast(x * conv_dilations[0]) - + static_cast(in_left_pads[0]); + + // Bounds check + if(wi >= 0 && wi < in_spatial_lengths[0]) + { + std::array in_spatial = {static_cast(wi)}; + std::array wei_spatial = {x}; + ck_tile::long_index_t in_idx = + detail::calculate_input_index<1>(n, g, c, in_spatial, in_strides); + ck_tile::long_index_t wei_idx = detail::calculate_weight_index<1>( + g, k, c, wei_spatial, wei_strides); + + v_acc += type_convert(p_in[in_idx]) * + type_convert(p_wei[wei_idx]); + } + } + } + else if constexpr(NDimSpatial == 2) + { + for(ck_tile::index_t y = 0; y < wei_spatial_lengths[0]; ++y) + { + ck_tile::long_index_t hi = + static_cast(out_spatial_idx[0] * + conv_strides[0]) + + static_cast(y * conv_dilations[0]) - + static_cast(in_left_pads[0]); + + for(ck_tile::index_t x = 0; x < wei_spatial_lengths[1]; ++x) + { + ck_tile::long_index_t wi = + static_cast(out_spatial_idx[1] * + conv_strides[1]) + + static_cast(x * conv_dilations[1]) - + static_cast(in_left_pads[1]); + + // Bounds check + if(hi >= 0 && hi < in_spatial_lengths[0] && wi >= 0 && + wi < in_spatial_lengths[1]) + { + std::array in_spatial = { + static_cast(hi), static_cast(wi)}; + std::array wei_spatial = {y, x}; + ck_tile::long_index_t in_idx = detail::calculate_input_index<2>( + n, g, c, in_spatial, in_strides); + ck_tile::long_index_t wei_idx = detail::calculate_weight_index<2>( + g, k, c, wei_spatial, wei_strides); + + v_acc += type_convert(p_in[in_idx]) * + type_convert(p_wei[wei_idx]); + } + } + } + } + else if constexpr(NDimSpatial == 3) + { + for(ck_tile::index_t z = 0; z < wei_spatial_lengths[0]; ++z) + { + ck_tile::long_index_t di = + static_cast(out_spatial_idx[0] * + conv_strides[0]) + + static_cast(z * conv_dilations[0]) - + static_cast(in_left_pads[0]); + + for(ck_tile::index_t y = 0; y < wei_spatial_lengths[1]; ++y) + { + ck_tile::long_index_t hi = + static_cast(out_spatial_idx[1] * + conv_strides[1]) + + static_cast(y * conv_dilations[1]) - + static_cast(in_left_pads[1]); + + for(ck_tile::index_t x = 0; x < wei_spatial_lengths[2]; ++x) + { + ck_tile::long_index_t wi = + static_cast(out_spatial_idx[2] * + conv_strides[2]) + + static_cast(x * conv_dilations[2]) - + static_cast(in_left_pads[2]); + + // Bounds check + if(di >= 0 && di < in_spatial_lengths[0] && hi >= 0 && + hi < in_spatial_lengths[1] && wi >= 0 && + wi < in_spatial_lengths[2]) + { + std::array in_spatial = { + static_cast(di), + static_cast(hi), + static_cast(wi)}; + std::array wei_spatial = {z, y, x}; + ck_tile::long_index_t in_idx = detail::calculate_input_index<3>( + n, g, c, in_spatial, in_strides); + ck_tile::long_index_t wei_idx = + detail::calculate_weight_index<3>( + g, k, c, wei_spatial, wei_strides); + + v_acc += type_convert(p_in[in_idx]) * + type_convert(p_wei[wei_idx]); + } + } + } + } + } + } + + // Convert accumulator to output type and write + p_out[ii] = type_convert(v_acc); + } + } +}; + +// 
+// Host-side launcher for naive grouped convolution forward
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType>
+CK_TILE_HOST float naive_grouped_conv_fwd(const InDataType* p_in_dev,
+                                          const WeiDataType* p_wei_dev,
+                                          OutDataType* p_out_dev,
+                                          ck_tile::index_t G,
+                                          ck_tile::index_t N,
+                                          ck_tile::index_t K,
+                                          ck_tile::index_t C,
+                                          std::vector<ck_tile::index_t> in_spatial_lengths,
+                                          std::vector<ck_tile::index_t> wei_spatial_lengths,
+                                          std::vector<ck_tile::index_t> out_spatial_lengths,
+                                          std::vector<ck_tile::index_t> conv_strides,
+                                          std::vector<ck_tile::index_t> conv_dilations,
+                                          std::vector<ck_tile::index_t> in_left_pads,
+                                          ck_tile::stream_config stream_config = {})
+{
+    // Convert vectors to arrays (std::array can be passed by value to the kernel)
+    auto in_spatial_arr     = to_array_with_default<NDimSpatial>(in_spatial_lengths);
+    auto wei_spatial_arr    = to_array_with_default<NDimSpatial>(wei_spatial_lengths);
+    auto out_spatial_arr    = to_array_with_default<NDimSpatial>(out_spatial_lengths);
+    auto conv_strides_arr   = to_array_with_default<NDimSpatial>(conv_strides);
+    auto conv_dilations_arr = to_array_with_default<NDimSpatial>(conv_dilations);
+    auto in_left_pads_arr   = to_array_with_default<NDimSpatial>(in_left_pads, 0);
+
+    // Calculate grid size
+    ck_tile::long_index_t output_length = G * N * K;
+    for(ck_tile::index_t i = 0; i < NDimSpatial; ++i)
+    {
+        output_length *= out_spatial_lengths[i];
+    }
+
+    using KernelType =
+        naive_grouped_conv_fwd_kernel<NDimSpatial, InDataType, WeiDataType, OutDataType>;
+
+    constexpr ck_tile::index_t block_size = KernelType::kBlockSize;
+    const ck_tile::index_t grid_size      = (output_length + block_size - 1) / block_size;
+
+    // Launch kernel
+    float elapsed_ms = launch_kernel(stream_config,
+                                     make_kernel(KernelType{},
+                                                 dim3(grid_size),
+                                                 dim3(block_size),
+                                                 0, // dynamic shared memory size
+                                                 p_in_dev,
+                                                 p_wei_dev,
+                                                 p_out_dev,
+                                                 G,
+                                                 N,
+                                                 K,
+                                                 C,
+                                                 in_spatial_arr,
+                                                 wei_spatial_arr,
+                                                 out_spatial_arr,
+                                                 conv_strides_arr,
+                                                 conv_dilations_arr,
+                                                 in_left_pads_arr));
+
+    return elapsed_ms;
+}
+
+} // namespace ck_tile
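The launcher copies the runtime `std::vector` parameters into fixed-size `std::array` values so they can be passed to the kernel by value. A minimal stand-in for `to_array_with_default` illustrating its assumed semantics (the real helper lives in the CK-Tile headers; this sketch only shows the intended pad-with-default behavior):

```cpp
// Toy stand-in for ck_tile's to_array_with_default: copy up to N entries
// from the vector, pad any missing trailing entries with a default value.
#include <array>
#include <cstdio>
#include <vector>

template <std::size_t N, typename T>
std::array<T, N> to_array_with_default_sketch(const std::vector<T>& v, T dflt = T{1})
{
    std::array<T, N> a{};
    for(std::size_t i = 0; i < N; ++i)
        a[i] = i < v.size() ? v[i] : dflt; // pad missing entries with the default
    return a;
}

int main()
{
    std::vector<int> strides{2, 2};
    auto arr = to_array_with_default_sketch<3>(strides); // pad 2D params to 3 entries
    std::printf("%d %d %d\n", arr[0], arr[1], arr[2]);   // prints: 2 2 1
    return 0;
}
```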
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/conv_common.hpp b/library/include/ck/library/reference_tensor_operation/gpu/conv_common.hpp
new file mode 100644
index 0000000000..285271c6ef
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/gpu/conv_common.hpp
@@ -0,0 +1,73 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#ifndef CONV_COMMON_HPP
+#define CONV_COMMON_HPP
+
+#include "ck/ck.hpp"
+#include "ck/library/utility/convolution_parameter.hpp" // for ck::utils::conv::ConvParam
+
+namespace ck {
+namespace ref {
+
+// Device-compatible dimension structure for GPU reference kernels.
+// Replaces passing 24 individual parameters.
+struct ConvDims
+{
+    index_t N, K, C;
+    index_t Di, Hi, Wi;
+    index_t Z, Y, X;
+    index_t Do, Ho, Wo;
+    index_t stride_z, stride_y, stride_x;
+    index_t dilation_z, dilation_y, dilation_x;
+    index_t pad_z, pad_y, pad_x;
+};
+
+} // namespace ref
+
+// Helper function to extract dimensions from ConvParam for GPU kernels.
+// Defined in the ck::utils::conv namespace for convenience.
+namespace utils {
+namespace conv {
+
+inline ck::ref::ConvDims
+extract_conv_dims(const ConvParam& conv_param, ck::index_t NDimSpatial, bool apply_group = true)
+{
+    ck::ref::ConvDims dims;
+    dims.N = conv_param.N_;
+    dims.K = conv_param.K_;
+    // The naive reference kernels use non-grouped NDHWC layouts, so groups
+    // are folded into the channel dimension by default.
+    dims.C = apply_group ? (conv_param.C_ * conv_param.G_) : conv_param.C_;
+
+    dims.Di = (NDimSpatial >= 3) ? conv_param.input_spatial_lengths_[0] : 1;
+    dims.Hi = (NDimSpatial >= 2) ? conv_param.input_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
+    dims.Wi = conv_param.input_spatial_lengths_[NDimSpatial - 1];
+
+    dims.Z = (NDimSpatial >= 3) ? conv_param.filter_spatial_lengths_[0] : 1;
+    dims.Y = (NDimSpatial >= 2) ? conv_param.filter_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
+    dims.X = conv_param.filter_spatial_lengths_[NDimSpatial - 1];
+
+    dims.Do = (NDimSpatial >= 3) ? conv_param.output_spatial_lengths_[0] : 1;
+    dims.Ho = (NDimSpatial >= 2) ? conv_param.output_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
+    dims.Wo = conv_param.output_spatial_lengths_[NDimSpatial - 1];
+
+    dims.stride_z = (NDimSpatial >= 3) ? conv_param.conv_filter_strides_[0] : 1;
+    dims.stride_y =
+        (NDimSpatial >= 2) ? conv_param.conv_filter_strides_[NDimSpatial >= 3 ? 1 : 0] : 1;
+    dims.stride_x = conv_param.conv_filter_strides_[NDimSpatial - 1];
+
+    dims.dilation_z = (NDimSpatial >= 3) ? conv_param.conv_filter_dilations_[0] : 1;
+    dims.dilation_y =
+        (NDimSpatial >= 2) ? conv_param.conv_filter_dilations_[NDimSpatial >= 3 ? 1 : 0] : 1;
+    dims.dilation_x = conv_param.conv_filter_dilations_[NDimSpatial - 1];
+
+    dims.pad_z = (NDimSpatial >= 3) ? conv_param.input_left_pads_[0] : 0;
+    dims.pad_y = (NDimSpatial >= 2) ? conv_param.input_left_pads_[NDimSpatial >= 3 ? 1 : 0] : 0;
+    dims.pad_x = conv_param.input_left_pads_[NDimSpatial - 1];
+
+    return dims;
+}
+
+} // namespace conv
+} // namespace utils
+} // namespace ck
+
+#endif
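`extract_conv_dims` collapses lower-rank problems into the fixed 3D `ConvDims` shape: missing spatial axes become size 1 (stride/dilation 1, pad 0), and with `apply_group = true` the group count is folded into the channel count. A standalone sketch of that mapping with hypothetical sizes:

```cpp
// Standalone illustration of the ConvDims mapping for a 2D grouped problem
// (hypothetical sizes): depth collapses to 1, groups fold into channels.
#include <cstdio>
#include <vector>

int main()
{
    const int NDimSpatial = 2, G = 2, C = 96;
    std::vector<int> in{71, 71}; // Hi x Wi for the 2D case

    const int Di = (NDimSpatial >= 3) ? in[0] : 1;
    const int Hi = (NDimSpatial >= 2) ? in[NDimSpatial >= 3 ? 1 : 0] : 1;
    const int Wi = in[NDimSpatial - 1];
    const int C_folded = C * G; // apply_group == true

    std::printf("Di=%d Hi=%d Wi=%d C=%d\n", Di, Hi, Wi, C_folded); // Di=1 Hi=71 Wi=71 C=192
    return 0;
}
```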
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp
new file mode 100644
index 0000000000..686b1a0d34
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp
@@ -0,0 +1,149 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck/utility/type_convert.hpp"
+#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
+
+namespace ck {
+namespace ref {
+
+/*
+ * \brief Naive implementation of 3D convolution backward data.
+ *        Layout is (NDHWC, KZYXC, NDHWK).
+ *        Computes the gradient with respect to the input.
+ *
+ * \param N number of batches
+ * \param K number of filters (output channels)
+ * \param C number of input channels
+ * \param (Di, Hi, Wi) depth, height and width dimensions of input
+ * \param (Z, Y, X) depth, height and width dimensions of filter
+ * \param (Do, Ho, Wo) depth, height and width dimensions of output
+ * \param (stride_z, stride_y, stride_x) strides
+ * \param (dilation_z, dilation_y, dilation_x) dilations
+ * \param (pad_z, pad_y, pad_x) pads
+ */
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename TAcc,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+__global__ void naive_conv_bwd_data_ndhwc_kzyxc_ndhwk(TIn* __restrict__ p_in_grad,
+                                                      const TWei* __restrict__ p_wei,
+                                                      const TOut* __restrict__ p_out_grad,
+                                                      const ConvDims dims)
+{
+    const index_t tid         = blockIdx.x * blockDim.x + threadIdx.x;
+    const index_t num_threads = blockDim.x * gridDim.x;
+    const long_index_t input_length =
+        static_cast<long_index_t>(dims.N) * dims.Di * dims.Hi * dims.Wi * dims.C;
+
+    const index_t in_strides[] = {
+        dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C};
+    const index_t out_strides[] = {
+        dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K};
+    const index_t wei_strides[] = {
+        dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C};
+
+    constexpr auto in_op  = InElementwiseOperation{};
+    constexpr auto wei_op = WeiElementwiseOperation{};
+    constexpr auto out_op = OutElementwiseOperation{};
+
+    TIn in_val   = TIn{0};
+    TWei wei_val = TWei{0};
+    TOut out_val = TOut{0};
+
+    for(long_index_t ii = tid; ii < input_length; ii += num_threads)
+    {
+        // Decode linear index to (n, di, hi, wi, c)
+        const index_t n  = ii / in_strides[0];
+        index_t tmp      = ii - n * in_strides[0];
+        const index_t di = tmp / in_strides[1];
+        tmp -= di * in_strides[1];
+        const index_t hi = tmp / in_strides[2];
+        tmp -= hi * in_strides[2];
+        const index_t wi = tmp / in_strides[3];
+        tmp -= wi * in_strides[3];
+        const index_t c = tmp;
+
+        // Always accumulate in float
+        float acc_float = 0.0f;
+
+        const TOut* out_n = p_out_grad + static_cast<long_index_t>(n) * out_strides[0];
+
+        // Loop over output channels
+        for(index_t k = 0; k < dims.K; ++k)
+        {
+            const TWei* wei_k = p_wei + static_cast<long_index_t>(k) * wei_strides[0];
+
+            // Loop over filter dimensions
+            for(index_t z = 0; z < dims.Z; ++z)
+            {
+                // Calculate output position from input position (inverse of forward)
+                index_t d_tmp = di + dims.pad_z - z * dims.dilation_z;
+                if(d_tmp % dims.stride_z != 0)
+                    continue;
+                index_t d_o = d_tmp / dims.stride_z;
+                if(d_o < 0 || d_o >= dims.Do)
+                    continue;
+
+                const TOut* out_n_do = out_n + d_o * out_strides[1];
+                const TWei* wei_k_z  = wei_k + z * wei_strides[1];
+
+                for(index_t y = 0; y < dims.Y; ++y)
+                {
+                    index_t h_tmp = hi + dims.pad_y - y * dims.dilation_y;
+                    if(h_tmp % dims.stride_y != 0)
+                        continue;
+                    index_t ho = h_tmp / dims.stride_y;
+                    if(ho < 0 || ho >= dims.Ho)
+                        continue;
+
+                    const TOut* out_n_do_ho = out_n_do + ho * out_strides[2];
+                    const TWei* wei_k_z_y   = wei_k_z + y * wei_strides[2];
+
+                    for(index_t x = 0; x < dims.X; ++x)
+                    {
+                        index_t w_tmp = wi + dims.pad_x - x * dims.dilation_x;
+                        if(w_tmp % dims.stride_x != 0)
+                            continue;
+                        index_t wo = w_tmp / dims.stride_x;
+                        if(wo < 0 || wo >= dims.Wo)
+                            continue;
+
+                        const TOut* out_n_do_ho_wo = out_n_do_ho + wo * out_strides[3];
+                        const TWei* wei_k_z_y_x    = wei_k_z_y + x * wei_strides[3];
+
+                        // Load values from memory
+                        TOut out_loaded = out_n_do_ho_wo[k];
+                        TWei wei_loaded = wei_k_z_y_x[c];
+
+                        // Apply element-wise operations (like the forward kernel does)
+                        out_op(out_val, out_loaded);
+                        wei_op(wei_val, wei_loaded);
+
+                        // Convert to float for multiplication
+                        float out_f = type_convert<float>(out_val);
+                        float wei_f = type_convert<float>(wei_val);
+
+                        acc_float += out_f * wei_f;
+                    }
+                }
+            }
+        }
+
+        // Convert float accumulator to TAcc, then to the input type
+        TAcc acc   = type_convert<TAcc>(acc_float);
+        TIn result = type_convert<TIn>(acc);
+
+        // Apply input element-wise operation (if any)
+        in_op(in_val, result);
+
+        // Write transformed result
+        p_in_grad[ii] = in_val;
+    }
+}
+
+} // namespace ref
+} // namespace ck
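The bwd-data kernel inverts the forward index map: the forward pass reads input at `wi = wo * stride - pad + x * dilation`, so for a fixed filter tap `x` the gradient at `wi` only receives contributions where `wi + pad - x * dilation` is a non-negative multiple of `stride`. A standalone round-trip check of that inversion (sizes are hypothetical):

```cpp
// Verify the forward/backward index relation used by the kernel above:
// every (wi, x) pair that passes the divisibility and bounds checks must
// map back to the same wi through the forward formula.
#include <cstdio>

int main()
{
    const int stride = 2, pad = 1, dilation = 1, X = 3, Wi = 8, Wo = 4;

    for(int wi = 0; wi < Wi; ++wi)
        for(int x = 0; x < X; ++x)
        {
            const int t = wi + pad - x * dilation;
            if(t % stride != 0) continue;    // no contributing output position
            const int wo = t / stride;
            if(wo < 0 || wo >= Wo) continue; // outside the output tensor
            if(wo * stride - pad + x * dilation != wi) return 1; // must round-trip
        }
    std::printf("inverse index map is consistent\n");
    return 0;
}
```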
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp
new file mode 100644
index 0000000000..ff44c9fcbd
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp
@@ -0,0 +1,139 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck/utility/type_convert.hpp"
+#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
+
+namespace ck {
+namespace ref {
+
+/*
+ * \brief Naive implementation of 3D convolution backward weight.
+ *        Layout is (NDHWC, KZYXC, NDHWK).
+ *        Computes the gradient with respect to the weights.
+ *
+ * \param N number of batches
+ * \param K number of filters (output channels)
+ * \param C number of input channels
+ * \param (Di, Hi, Wi) depth, height and width dimensions of input
+ * \param (Z, Y, X) depth, height and width dimensions of filter
+ * \param (Do, Ho, Wo) depth, height and width dimensions of output
+ * \param (stride_z, stride_y, stride_x) strides
+ * \param (dilation_z, dilation_y, dilation_x) dilations
+ * \param (pad_z, pad_y, pad_x) pads
+ */
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename TAcc,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+__global__ void naive_conv_bwd_weight_ndhwc_kzyxc_ndhwk(const TIn* __restrict__ p_in,
+                                                        TWei* __restrict__ p_wei_grad,
+                                                        const TOut* __restrict__ p_out_grad,
+                                                        const ConvDims dims)
+{
+    const index_t tid         = blockIdx.x * blockDim.x + threadIdx.x;
+    const index_t num_threads = blockDim.x * gridDim.x;
+    const long_index_t weight_length =
+        static_cast<long_index_t>(dims.K) * dims.Z * dims.Y * dims.X * dims.C;
+
+    const index_t in_strides[] = {
+        dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C};
+    const index_t out_strides[] = {
+        dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K};
+    const index_t wei_strides[] = {
+        dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C};
+
+    constexpr auto in_op  = InElementwiseOperation{};
+    constexpr auto wei_op = WeiElementwiseOperation{};
+    constexpr auto out_op = OutElementwiseOperation{};
+
+    TIn in_val   = TIn{0};
+    TWei wei_val = TWei{0};
+    TOut out_val = TOut{0};
+
+    for(long_index_t ii = tid; ii < weight_length; ii += num_threads)
+    {
+        // Decode linear index to (k, z, y, x, c)
+        const index_t k = ii / wei_strides[0];
+        index_t tmp     = ii - k * wei_strides[0];
+        const index_t z = tmp / wei_strides[1];
+        tmp -= z * wei_strides[1];
+        const index_t y = tmp / wei_strides[2];
+        tmp -= y * wei_strides[2];
+        const index_t x = tmp / wei_strides[3];
+        tmp -= x * wei_strides[3];
+        const index_t c = tmp;
+
+        // Always accumulate in float
+        float acc_float = 0.0f;
+
+        // Loop over batch
+        for(index_t n = 0; n < dims.N; ++n)
+        {
+            const TIn* in_n   = p_in + static_cast<long_index_t>(n) * in_strides[0];
+            const TOut* out_n = p_out_grad + static_cast<long_index_t>(n) * out_strides[0];
+
+            // Loop over output spatial dimensions
+            for(index_t d_o = 0; d_o < dims.Do; ++d_o)
+            {
+                // Calculate input position from output position
+                index_t di = d_o * dims.stride_z - dims.pad_z + z * dims.dilation_z;
+                if(di < 0 || di >= dims.Di)
+                    continue;
+
+                const TIn* in_n_di   = in_n + di * in_strides[1];
+                const TOut* out_n_do = out_n + d_o * out_strides[1];
+
+                for(index_t ho = 0; ho < dims.Ho; ++ho)
+                {
+                    index_t hi = ho * dims.stride_y - dims.pad_y + y * dims.dilation_y;
+                    if(hi < 0 || hi >= dims.Hi)
+                        continue;
+
+                    const TIn* in_n_di_hi   = in_n_di + hi * in_strides[2];
+                    const TOut* out_n_do_ho = out_n_do + ho * out_strides[2];
+
+                    for(index_t wo = 0; wo < dims.Wo; ++wo)
+                    {
+                        index_t wi = wo * dims.stride_x - dims.pad_x + x * dims.dilation_x;
+                        if(wi < 0 || wi >= dims.Wi)
+                            continue;
+
+                        // Load values from memory (like the forward kernel does)
+                        const TIn* in_ptr   = in_n_di_hi + wi * in_strides[3];
+                        const TOut* out_ptr = out_n_do_ho + wo * out_strides[3];
+
+                        TIn in_loaded   = in_ptr[c];
+                        TOut out_loaded = out_ptr[k];
+
+                        // Apply element-wise operations
+                        in_op(in_val, in_loaded);
+                        out_op(out_val, out_loaded);
+
+                        // Convert to float for multiplication
+                        float in_f  = type_convert<float>(in_val);
+                        float out_f = type_convert<float>(out_val);
+
+                        acc_float += out_f * in_f;
+                    }
+                }
+            }
+        }
+
+        // Convert float accumulator to TAcc, then to the weight type
+        TAcc acc    = type_convert<TAcc>(acc_float);
+        TWei result = type_convert<TWei>(acc);
+
+        // Apply weight element-wise operation (if any)
+        wei_op(wei_val, result);
+
+        // Write transformed result
+        p_wei_grad[ii] = wei_val;
+    }
+}
+
+} // namespace ref
+} // namespace ck
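All three reference kernels share the same launch arithmetic: one thread per element, 256-thread blocks, grid rounded up; the grid-stride loop keeps any smaller grid correct as well. A quick standalone check (sizes are hypothetical):

```cpp
// Standalone check of the shared launch arithmetic used by these kernels.
#include <cstdint>
#include <cstdio>

int main()
{
    const int64_t K = 64, Z = 3, Y = 3, X = 3, C = 32;
    const int64_t weight_length = K * Z * Y * X * C; // elements of the weight gradient

    const int64_t block_size = 256;
    const int64_t grid_size  = (weight_length + block_size - 1) / block_size;

    // With this grid every index ii < weight_length is owned by exactly one
    // thread; a smaller grid would still be correct because the kernels use
    // grid-stride loops.
    std::printf("elements=%lld blocks=%lld threads=%lld\n",
                (long long)weight_length, (long long)grid_size,
                (long long)(grid_size * block_size));
    return 0;
}
```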
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp
similarity index 51%
rename from library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp
rename to library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp
index 8e2e63d527..defbbd5be4 100644
--- a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp
@@ -1,8 +1,10 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
 
-#ifndef NAIVE_CONV_FWD_HPP
-#define NAIVE_CONV_FWD_HPP
+#pragma once
+
+#include "ck/utility/type_convert.hpp"
+#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
 
 namespace ck {
 namespace ref {
@@ -30,43 +32,26 @@ template <typename TIn,
           typename TWei,
           typename TOut,
           typename TAcc,
           typename InElementwiseOperation,
           typename WeiElementwiseOperation,
           typename OutElementwiseOperation>
 __global__ void naive_conv_fwd_ndhwc_kzyxc_ndhwk(const TIn* __restrict__ p_in,
                                                  const TWei* __restrict__ p_wei,
                                                  TOut* __restrict__ p_out,
-                                                 index_t N,
-                                                 index_t K,
-                                                 index_t C,
-                                                 index_t Di,
-                                                 index_t Hi,
-                                                 index_t Wi,
-                                                 index_t Z,
-                                                 index_t Y,
-                                                 index_t X,
-                                                 index_t Do,
-                                                 index_t Ho,
-                                                 index_t Wo,
-                                                 index_t stride_z,
-                                                 index_t stride_y,
-                                                 index_t stride_x,
-                                                 index_t dilation_z,
-                                                 index_t dilation_y,
-                                                 index_t dilation_x,
-                                                 index_t pad_z,
-                                                 index_t pad_y,
-                                                 index_t pad_x)
+                                                 const ConvDims dims)
 {
@@ -85,42 +73,55 @@ __global__ void naive_conv_fwd_ndhwc_kzyxc_ndhwk(const TIn* __restrict__ p_in,
-        TAcc acc = static_cast<TAcc>(0);
+        // Always accumulate in float (FP8/BF8 don't support arithmetic)
+        float acc_float = 0.0f;
 
         const TIn* in_n   = p_in + static_cast<long_index_t>(n) * in_strides[0];
         const TWei* wei_k = p_wei + static_cast<long_index_t>(k) * wei_strides[0];
 
-        for(index_t z = 0; z < Z; ++z)
+        for(index_t z = 0; z < dims.Z; ++z)
         {
-            index_t di = stride_z * dO - pad_z + dilation_z * z;
+            index_t di = dims.stride_z * dO - dims.pad_z + dims.dilation_z * z;
 
             const TIn* in_n_di  = in_n + di * in_strides[1];
             const TWei* wei_k_z = wei_k + z * wei_strides[1];
 
-            for(index_t y = 0; y < Y; ++y)
+            for(index_t y = 0; y < dims.Y; ++y)
             {
-                index_t hi = stride_y * ho - pad_y + dilation_y * y;
+                index_t hi = dims.stride_y * ho - dims.pad_y + dims.dilation_y * y;
 
                 const TIn* in_n_di_hi = in_n_di + hi * in_strides[2];
                 const TWei* wei_k_z_y = wei_k_z + y * wei_strides[2];
 
-                for(index_t x = 0; x < X; ++x)
+                for(index_t x = 0; x < dims.X; ++x)
                 {
-                    index_t wi = stride_x * wo - pad_x + dilation_x * x;
+                    index_t wi = dims.stride_x * wo - dims.pad_x + dims.dilation_x * x;
 
                     const TIn* in_n_di_hi_wi = in_n_di_hi + wi * in_strides[3];
                     const TWei* wei_k_z_y_x  = wei_k_z_y + x * wei_strides[3];
 
-                    if(di >= 0 && di < Di && hi >= 0 && hi < Hi && wi >= 0 && wi < Wi)
+                    if(di >= 0 && di < dims.Di && hi >= 0 && hi < dims.Hi && wi >= 0 &&
+                       wi < dims.Wi)
                     {
-                        for(index_t c = 0; c < C; ++c)
+                        for(index_t c = 0; c < dims.C; ++c)
                         {
-                            in_op(in_val, in_n_di_hi_wi[c]);
-                            wei_op(wei_val, wei_k_z_y_x[c]);
-                            acc += in_val * wei_val;
+                            // Load values from memory
+                            TIn in_loaded   = in_n_di_hi_wi[c];
+                            TWei wei_loaded = wei_k_z_y_x[c];
+
+                            // Apply element-wise operations
+                            in_op(in_val, in_loaded);
+                            wei_op(wei_val, wei_loaded);
+
+                            // Always convert to float for multiplication (FP8/BF8 don't
+                            // support direct arithmetic)
+                            float in_f  = type_convert<float>(in_val);
+                            float wei_f = type_convert<float>(wei_val);
+
+                            // Accumulate in float
+                            acc_float += in_f * wei_f;
                         }
                     }
                 }
             }
         }
 
-        out_op(out_val, static_cast<TAcc>(acc));
+        // Convert float accumulator to TAcc, then to the output type
+        TAcc acc    = type_convert<TAcc>(acc_float);
+        TOut result = type_convert<TOut>(acc);
+
+        // Apply output element-wise operation (if any)
+        out_op(out_val, result);
+
+        // Write transformed result
         p_out[ii] = out_val;
     }
 }
 
 } // namespace ref
 } // namespace ck
-
-#endif
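The accumulator change above exists because sums kept in a low-precision type lose small updates once the running total grows. A standalone illustration using a bf16-like truncation; the rounding helper is a toy stand-in, not CK's `type_convert`:

```cpp
// Demonstrate why the kernel accumulates in float: a low-precision running
// sum stalls once its spacing (ulp) exceeds the per-step addend.
#include <cstdint>
#include <cstdio>
#include <cstring>

static float to_bf16ish(float x) // keep only the top 16 bits of the float
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof(u));
    u &= 0xFFFF0000u;
    std::memcpy(&x, &u, sizeof(x));
    return x;
}

int main()
{
    float acc_low = 0.0f, acc_f = 0.0f;
    for(int i = 0; i < 100000; ++i)
    {
        const float prod = 0.01f; // one product term of the convolution sum
        acc_low = to_bf16ish(acc_low + prod); // round after every addition
        acc_f += prod;                        // accumulate in full float
    }
    // The truncated sum stalls early (around 2.0 here); the float sum
    // reaches roughly the expected 1000.
    std::printf("low-precision acc = %f, float acc = %f\n", acc_low, acc_f);
    return 0;
}
```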