mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
[CK] Integrate GPU reference into ckProfiler for convolutions (#3379)
Refactor and integrate CK GPU references into ckProfiler.
- All convolution layouts and groupings supported for all three directions
- Unit tests verifying GPU and CPU reference is the same
- Support added to profiler (do_verification = 2 enables GPU reference)
- One profiler-based test per direction changed to GPU reference to demonstrate usag
Closes AICK-427
[ROCm/composable_kernel commit: bb8445dca8]
This commit is contained in:
@@ -131,6 +131,9 @@ template <ck::index_t NDimSpatial,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp,
|
||||
typename DeviceConvNDFwdInstance,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
typename ComputeDataType = OutDataType>
|
||||
bool run_grouped_conv_fwd(int do_verification,
|
||||
int init_method,
|
||||
@@ -283,31 +286,25 @@ bool run_grouped_conv_fwd(int do_verification,
|
||||
DeviceMem out_device_ref_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
|
||||
out_device_ref_buf.SetZero();
|
||||
|
||||
// Extract dimensions using helper function
|
||||
ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
|
||||
|
||||
// Launch GPU reference kernel
|
||||
constexpr ck::index_t block_size = 256;
|
||||
const ck::long_index_t output_length = dims.N * dims.Do * dims.Ho * dims.Wo * dims.K;
|
||||
const ck::index_t grid_size = (output_length + block_size - 1) / block_size;
|
||||
|
||||
auto gpu_ref_kernel = ck::ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
ComputeDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>;
|
||||
|
||||
gpu_ref_kernel<<<dim3(grid_size), dim3(block_size), 0, nullptr>>>(
|
||||
// Call GPU reference with ConvParam directly, using the correct layout types
|
||||
ck::ref::naive_conv_fwd<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<OutDataType*>(out_device_ref_buf.GetDeviceBuffer()),
|
||||
dims);
|
||||
conv_param);
|
||||
|
||||
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
||||
|
||||
std::cout << "GPU reference kernel completed successfully, copying results..." << std::endl;
|
||||
std::cout << "GPU reference function completed successfully, copying results..."
|
||||
<< std::endl;
|
||||
|
||||
// Copy GPU reference result to host
|
||||
out_device_ref_buf.FromDevice(out_host.mData.data());
|
||||
|
||||
@@ -12,7 +12,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
|
||||
{
|
||||
print_helper_msg();
|
||||
|
||||
int do_verification = 1; // 0=no, 1=CPU, 2=GPU
|
||||
int do_verification = 2; // 0=no, 1=CPU, 2=GPU
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
@@ -71,6 +71,9 @@ bool run_convnd_fwd_example(int argc, char* argv[])
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
ComputeDataType>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
|
||||
@@ -18,7 +18,8 @@
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
|
||||
#include "ck_tile/host/hip_check_error.hpp"
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/host_utility/hip_check_error.hpp"
|
||||
|
||||
using ::ck::DeviceMem;
|
||||
using ::ck::HostTensorDescriptor;
|
||||
@@ -81,7 +82,10 @@ template <ck::index_t NDimSpatial,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp,
|
||||
typename DeviceConvNdBwdDataInstance>
|
||||
typename DeviceConvNdBwdDataInstance,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout>
|
||||
int run_conv_bwd_data(int do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
@@ -225,50 +229,52 @@ int run_conv_bwd_data(int do_verification,
|
||||
}
|
||||
else if(do_verification == 2)
|
||||
{
|
||||
// GPU verification
|
||||
// GPU verification using naive GPU reference
|
||||
std::cout << "Running GPU verification..." << std::endl;
|
||||
|
||||
// Allocate and ZERO GPU memory for reference input
|
||||
DeviceMem in_device_ref_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
|
||||
in_device_ref_buf.SetZero();
|
||||
|
||||
// Extract dimensions using helper function
|
||||
ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
|
||||
|
||||
constexpr ck::index_t block_size = 256;
|
||||
const ck::long_index_t input_length = dims.N * dims.Di * dims.Hi * dims.Wi * dims.C;
|
||||
const ck::index_t grid_size = (input_length + block_size - 1) / block_size;
|
||||
|
||||
auto gpu_ref_kernel = ck::ref::naive_conv_bwd_data_ndhwc_kzyxc_ndhwk<InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
float,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>;
|
||||
|
||||
gpu_ref_kernel<<<dim3(grid_size), dim3(block_size), 0, nullptr>>>(
|
||||
// Call GPU reference with ConvParam directly, using the correct layout types
|
||||
ck::ref::naive_conv_bwd_data<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
reinterpret_cast<InDataType*>(in_device_ref_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
|
||||
dims);
|
||||
conv_param,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
||||
|
||||
std::cout << "GPU reference kernel completed, copying results..." << std::endl;
|
||||
std::cout << "GPU reference function completed successfully, copying results..."
|
||||
<< std::endl;
|
||||
|
||||
// Copy GPU reference result
|
||||
// Copy GPU reference result to host
|
||||
Tensor<InDataType> in_gpu_ref(in_host.mDesc);
|
||||
in_device_ref_buf.FromDevice(in_gpu_ref.mData.data());
|
||||
|
||||
// Copy optimized kernel result
|
||||
// Copy GPU kernel result to host
|
||||
in_device_buf.FromDevice(in_device.mData.data());
|
||||
|
||||
std::cout << "Comparing GPU kernel output vs GPU reference..." << std::endl;
|
||||
|
||||
// Compare: Optimized kernel result vs GPU reference result
|
||||
bool pass = ck::utils::check_err(in_device,
|
||||
in_gpu_ref,
|
||||
"Error: Incorrect results!",
|
||||
get_rtol<InDataType, float>(),
|
||||
get_atol<InDataType, float>());
|
||||
|
||||
std::cout << "GPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
|
||||
|
||||
return pass ? 0 : 1;
|
||||
|
||||
@@ -92,16 +92,19 @@ int main(int argc, char* argv[])
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
DeviceConvNdBwdDataInstance<1>>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
DeviceConvNdBwdDataInstance<1>,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 2)
|
||||
{
|
||||
@@ -128,16 +131,19 @@ int main(int argc, char* argv[])
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
DeviceConvNdBwdDataInstance<2>>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
DeviceConvNdBwdDataInstance<2>,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 3)
|
||||
{
|
||||
@@ -164,16 +170,19 @@ int main(int argc, char* argv[])
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
DeviceConvNdBwdDataInstance<3>>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
DeviceConvNdBwdDataInstance<3>,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -119,16 +119,19 @@ int main(int argc, char* argv[])
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
DeviceConvNdBwdDataInstance<1>>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
DeviceConvNdBwdDataInstance<1>,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 2)
|
||||
{
|
||||
@@ -155,16 +158,19 @@ int main(int argc, char* argv[])
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
DeviceConvNdBwdDataInstance<2>>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
DeviceConvNdBwdDataInstance<2>,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 3)
|
||||
{
|
||||
@@ -191,16 +197,19 @@ int main(int argc, char* argv[])
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
DeviceConvNdBwdDataInstance<3>>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
DeviceConvNdBwdDataInstance<3>,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -149,55 +149,53 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
|
||||
}
|
||||
else if(config.do_verification == 2)
|
||||
{
|
||||
// GPU verification (only supports G=1, standard convolution)
|
||||
if(conv_param.G_ != 1)
|
||||
{
|
||||
std::cout << "GPU verification only supports G=1 (standard convolution)" << std::endl;
|
||||
std::cout << "Current G=" << conv_param.G_ << " not supported." << std::endl;
|
||||
std::cout << "Use do_verification=1 for CPU verification with grouped convolution."
|
||||
<< std::endl;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::cout << "Running GPU verification (G=1)..." << std::endl;
|
||||
// GPU verification using naive GPU reference
|
||||
std::cout << "Running GPU verification..." << std::endl;
|
||||
|
||||
// Allocate and ZERO GPU memory for reference weights
|
||||
DeviceMem wei_device_ref_buf(sizeof(WeiDataType) *
|
||||
wei_device_result.mDesc.GetElementSpaceSize());
|
||||
wei_device_ref_buf.SetZero();
|
||||
|
||||
// Extract dimensions using helper function (G=1, standard convolution)
|
||||
ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial, false);
|
||||
// Call GPU reference function with ConvParam and layout types
|
||||
using InLayout = InputLayout<NDimSpatial>;
|
||||
using WeiLayout = WeightLayout<NDimSpatial>;
|
||||
using OutLayout = OutputLayout<NDimSpatial>;
|
||||
|
||||
constexpr ck::index_t block_size = 256;
|
||||
const ck::long_index_t weight_length = dims.K * dims.Z * dims.Y * dims.X * dims.C;
|
||||
const ck::index_t grid_size = (weight_length + block_size - 1) / block_size;
|
||||
|
||||
auto gpu_ref_kernel = ck::ref::naive_conv_bwd_weight_ndhwc_kzyxc_ndhwk<InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
float,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>;
|
||||
|
||||
gpu_ref_kernel<<<dim3(grid_size), dim3(block_size), 0, nullptr>>>(
|
||||
ck::ref::naive_conv_bwd_weight<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<WeiDataType*>(wei_device_ref_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
|
||||
dims);
|
||||
conv_param);
|
||||
|
||||
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
||||
|
||||
std::cout << "GPU reference kernel completed, copying results..." << std::endl;
|
||||
std::cout << "GPU reference function completed successfully, copying results..."
|
||||
<< std::endl;
|
||||
|
||||
// Copy GPU reference result to host
|
||||
wei_device_ref_buf.FromDevice(wei_host_result.mData.data());
|
||||
|
||||
// Copy GPU kernel result to host
|
||||
wei_device_buf.FromDevice(wei_device_result.mData.data());
|
||||
|
||||
std::cout << "Comparing GPU kernel output vs GPU reference..." << std::endl;
|
||||
|
||||
// Compare: Optimized kernel result vs GPU reference result
|
||||
bool pass = ck::utils::check_err(wei_device_result.mData,
|
||||
wei_host_result.mData,
|
||||
"Error: Incorrect results!",
|
||||
get_rtol<WeiDataType, float>(),
|
||||
get_atol<WeiDataType, float>());
|
||||
|
||||
std::cout << "GPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
|
||||
|
||||
return pass;
|
||||
|
||||
@@ -1,353 +0,0 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// Standalone test program for Old CK GPU references
|
||||
// Tests naive_conv_fwd (existing) and future backward ops
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
#include <algorithm>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
|
||||
// CPU reference for validation
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
|
||||
// GPU reference (OLD CK - already exists!)
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
template <index_t NDimSpatial>
|
||||
struct ConvParams
|
||||
{
|
||||
index_t N, K, C;
|
||||
std::vector<index_t> input_spatial;
|
||||
std::vector<index_t> filter_spatial;
|
||||
std::vector<index_t> output_spatial;
|
||||
std::vector<index_t> strides;
|
||||
std::vector<index_t> dilations;
|
||||
std::vector<index_t> pads;
|
||||
};
|
||||
|
||||
template <index_t NDimSpatial, typename InDataType, typename WeiDataType, typename OutDataType>
|
||||
bool test_conv_forward_gpu_ref(const ConvParams<NDimSpatial>& params, const std::string& test_name)
|
||||
{
|
||||
std::cout << "[TEST] " << test_name << std::endl;
|
||||
|
||||
// Calculate dimensions
|
||||
const index_t N = params.N;
|
||||
const index_t K = params.K;
|
||||
const index_t C = params.C;
|
||||
|
||||
// Create tensor descriptors (NDHWC layout for old CK)
|
||||
std::vector<index_t> in_lengths = {N};
|
||||
for(auto d : params.input_spatial)
|
||||
in_lengths.push_back(d);
|
||||
in_lengths.push_back(C);
|
||||
|
||||
std::vector<index_t> wei_lengths = {K};
|
||||
for(auto d : params.filter_spatial)
|
||||
wei_lengths.push_back(d);
|
||||
wei_lengths.push_back(C);
|
||||
|
||||
std::vector<index_t> out_lengths = {N};
|
||||
for(auto d : params.output_spatial)
|
||||
out_lengths.push_back(d);
|
||||
out_lengths.push_back(K);
|
||||
|
||||
// Create host tensors
|
||||
Tensor<InDataType> input(in_lengths);
|
||||
Tensor<WeiDataType> weight(wei_lengths);
|
||||
Tensor<OutDataType> output_gpu(out_lengths);
|
||||
Tensor<OutDataType> output_ref(out_lengths);
|
||||
|
||||
// Initialize with random data
|
||||
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
|
||||
weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
|
||||
|
||||
// Allocate device memory
|
||||
DeviceMem input_dev(input.mData.size() * sizeof(InDataType));
|
||||
DeviceMem weight_dev(weight.mData.size() * sizeof(WeiDataType));
|
||||
DeviceMem output_dev(output_gpu.mData.size() * sizeof(OutDataType));
|
||||
|
||||
// Copy to device
|
||||
input_dev.ToDevice(input.mData.data());
|
||||
weight_dev.ToDevice(weight.mData.data());
|
||||
|
||||
// Run CPU reference for validation
|
||||
auto ref_conv =
|
||||
tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
tensor_operation::element_wise::PassThrough,
|
||||
tensor_operation::element_wise::PassThrough,
|
||||
tensor_operation::element_wise::PassThrough>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_arg = ref_conv.MakeArgument(input.mData.data(),
|
||||
weight.mData.data(),
|
||||
output_ref.mData.data(),
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
params.input_spatial,
|
||||
params.filter_spatial,
|
||||
params.output_spatial,
|
||||
params.strides,
|
||||
params.dilations,
|
||||
params.pads,
|
||||
params.pads,
|
||||
{},
|
||||
{},
|
||||
{});
|
||||
|
||||
ref_invoker.Run(ref_arg);
|
||||
|
||||
// Run GPU reference (OLD CK)
|
||||
using InElementOp = tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
constexpr index_t block_size = 256;
|
||||
|
||||
// Extract dimensions based on NDimSpatial
|
||||
index_t Di = 1, Hi = 1, Wi = 1;
|
||||
index_t Z = 1, Y = 1, X = 1;
|
||||
index_t Do = 1, Ho = 1, Wo = 1;
|
||||
index_t stride_z = 1, stride_y = 1, stride_x = 1;
|
||||
index_t dilation_z = 1, dilation_y = 1, dilation_x = 1;
|
||||
index_t pad_z = 0, pad_y = 0, pad_x = 0;
|
||||
|
||||
if(NDimSpatial == 1)
|
||||
{
|
||||
Wi = params.input_spatial[0];
|
||||
X = params.filter_spatial[0];
|
||||
Wo = params.output_spatial[0];
|
||||
stride_x = params.strides[0];
|
||||
dilation_x = params.dilations[0];
|
||||
pad_x = params.pads[0];
|
||||
}
|
||||
else if(NDimSpatial == 2)
|
||||
{
|
||||
Hi = params.input_spatial[0];
|
||||
Wi = params.input_spatial[1];
|
||||
Y = params.filter_spatial[0];
|
||||
X = params.filter_spatial[1];
|
||||
Ho = params.output_spatial[0];
|
||||
Wo = params.output_spatial[1];
|
||||
stride_y = params.strides[0];
|
||||
stride_x = params.strides[1];
|
||||
dilation_y = params.dilations[0];
|
||||
dilation_x = params.dilations[1];
|
||||
pad_y = params.pads[0];
|
||||
pad_x = params.pads[1];
|
||||
}
|
||||
else if(NDimSpatial == 3)
|
||||
{
|
||||
Di = params.input_spatial[0];
|
||||
Hi = params.input_spatial[1];
|
||||
Wi = params.input_spatial[2];
|
||||
Z = params.filter_spatial[0];
|
||||
Y = params.filter_spatial[1];
|
||||
X = params.filter_spatial[2];
|
||||
Do = params.output_spatial[0];
|
||||
Ho = params.output_spatial[1];
|
||||
Wo = params.output_spatial[2];
|
||||
stride_z = params.strides[0];
|
||||
stride_y = params.strides[1];
|
||||
stride_x = params.strides[2];
|
||||
dilation_z = params.dilations[0];
|
||||
dilation_y = params.dilations[1];
|
||||
dilation_x = params.dilations[2];
|
||||
pad_z = params.pads[0];
|
||||
pad_y = params.pads[1];
|
||||
pad_x = params.pads[2];
|
||||
}
|
||||
|
||||
// Launch GPU reference kernel
|
||||
const long_index_t output_length = N * Do * Ho * Wo * K;
|
||||
const index_t grid_size = (output_length + block_size - 1) / block_size;
|
||||
|
||||
hipLaunchKernelGGL(ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
float,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>,
|
||||
dim3(grid_size),
|
||||
dim3(block_size),
|
||||
0,
|
||||
nullptr,
|
||||
reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<OutDataType*>(output_dev.GetDeviceBuffer()),
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
Di,
|
||||
Hi,
|
||||
Wi,
|
||||
Z,
|
||||
Y,
|
||||
X,
|
||||
Do,
|
||||
Ho,
|
||||
Wo,
|
||||
stride_z,
|
||||
stride_y,
|
||||
stride_x,
|
||||
dilation_z,
|
||||
dilation_y,
|
||||
dilation_x,
|
||||
pad_z,
|
||||
pad_y,
|
||||
pad_x);
|
||||
|
||||
hipDeviceSynchronize();
|
||||
|
||||
// Copy result back
|
||||
output_dev.FromDevice(output_gpu.mData.data());
|
||||
|
||||
// Compare GPU ref vs CPU ref
|
||||
bool pass = check_err(output_gpu.mData, output_ref.mData, "GPU vs CPU ref", 1e-3, 1e-3);
|
||||
|
||||
std::cout << " Result: " << (pass ? "✅ PASS" : "❌ FAIL") << std::endl;
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
std::cout << "========================================" << std::endl;
|
||||
std::cout << "Old CK GPU Reference Test Program" << std::endl;
|
||||
std::cout << "========================================" << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
int passed = 0;
|
||||
int failed = 0;
|
||||
|
||||
// Test 1: 2D Conv, FP16, Small
|
||||
{
|
||||
ConvParams<2> params;
|
||||
params.N = 2;
|
||||
params.K = 8;
|
||||
params.C = 8;
|
||||
params.input_spatial = {7, 7};
|
||||
params.filter_spatial = {3, 3};
|
||||
params.output_spatial = {5, 5};
|
||||
params.strides = {1, 1};
|
||||
params.dilations = {1, 1};
|
||||
params.pads = {0, 0};
|
||||
|
||||
if(test_conv_forward_gpu_ref<2, half_t, half_t, half_t>(params, "2D-FP16-Small"))
|
||||
passed++;
|
||||
else
|
||||
failed++;
|
||||
}
|
||||
|
||||
// Test 2: 2D Conv, FP32, Medium
|
||||
{
|
||||
ConvParams<2> params;
|
||||
params.N = 4;
|
||||
params.K = 16;
|
||||
params.C = 16;
|
||||
params.input_spatial = {14, 14};
|
||||
params.filter_spatial = {3, 3};
|
||||
params.output_spatial = {12, 12};
|
||||
params.strides = {1, 1};
|
||||
params.dilations = {1, 1};
|
||||
params.pads = {0, 0};
|
||||
|
||||
if(test_conv_forward_gpu_ref<2, float, float, float>(params, "2D-FP32-Medium"))
|
||||
passed++;
|
||||
else
|
||||
failed++;
|
||||
}
|
||||
|
||||
// Test 3: 1D Conv, FP16
|
||||
{
|
||||
ConvParams<1> params;
|
||||
params.N = 2;
|
||||
params.K = 8;
|
||||
params.C = 8;
|
||||
params.input_spatial = {16};
|
||||
params.filter_spatial = {3};
|
||||
params.output_spatial = {14};
|
||||
params.strides = {1};
|
||||
params.dilations = {1};
|
||||
params.pads = {0};
|
||||
|
||||
if(test_conv_forward_gpu_ref<1, half_t, half_t, half_t>(params, "1D-FP16"))
|
||||
passed++;
|
||||
else
|
||||
failed++;
|
||||
}
|
||||
|
||||
// Test 4: 3D Conv, FP16, Small
|
||||
{
|
||||
ConvParams<3> params;
|
||||
params.N = 1;
|
||||
params.K = 8;
|
||||
params.C = 8;
|
||||
params.input_spatial = {5, 5, 5};
|
||||
params.filter_spatial = {3, 3, 3};
|
||||
params.output_spatial = {3, 3, 3};
|
||||
params.strides = {1, 1, 1};
|
||||
params.dilations = {1, 1, 1};
|
||||
params.pads = {0, 0, 0};
|
||||
|
||||
if(test_conv_forward_gpu_ref<3, half_t, half_t, half_t>(params, "3D-FP16-Small"))
|
||||
passed++;
|
||||
else
|
||||
failed++;
|
||||
}
|
||||
|
||||
// Test 5: 2D Conv with stride
|
||||
{
|
||||
ConvParams<2> params;
|
||||
params.N = 2;
|
||||
params.K = 8;
|
||||
params.C = 8;
|
||||
params.input_spatial = {8, 8};
|
||||
params.filter_spatial = {3, 3};
|
||||
params.output_spatial = {3, 3};
|
||||
params.strides = {2, 2};
|
||||
params.dilations = {1, 1};
|
||||
params.pads = {0, 0};
|
||||
|
||||
if(test_conv_forward_gpu_ref<2, half_t, half_t, half_t>(params, "2D-FP16-Stride2"))
|
||||
passed++;
|
||||
else
|
||||
failed++;
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "========================================" << std::endl;
|
||||
std::cout << "SUMMARY" << std::endl;
|
||||
std::cout << "========================================" << std::endl;
|
||||
std::cout << "Total: " << (passed + failed) << std::endl;
|
||||
std::cout << "Passed: " << passed << " ✅" << std::endl;
|
||||
std::cout << "Failed: " << failed << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
if(failed == 0)
|
||||
{
|
||||
std::cout << "🎉 ALL TESTS PASSED!" << std::endl;
|
||||
std::cout << "Old CK Forward GPU Reference: WORKING ✅" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "❌ SOME TESTS FAILED" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
@@ -7,11 +7,12 @@
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include "conv_util.hpp"
|
||||
#include "device.hpp"
|
||||
#include "device_conv_fwd.hpp"
|
||||
#include "common_header.hpp"
|
||||
#include "naive_conv_fwd_gpu.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/stream_config.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
@@ -26,7 +27,16 @@ template <typename InDataType,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation>
|
||||
struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K
|
||||
: public DeviceConvFwd<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>
|
||||
: public DeviceConvFwd<3,
|
||||
ck::tensor_layout::convolution::NDHWC,
|
||||
ck::tensor_layout::convolution::KZYXC,
|
||||
ck::tensor_layout::convolution::NDHWK,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
|
||||
{
|
||||
using DeviceOp = DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K;
|
||||
@@ -57,6 +67,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
|
||||
WeiElementwiseOperation wei_element_op,
|
||||
OutElementwiseOperation out_element_op)
|
||||
: params_{3,
|
||||
1, // G (group count, always 1 for non-grouped)
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
@@ -78,7 +89,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
|
||||
}
|
||||
|
||||
// private:
|
||||
utils::conv::ConvParams params_;
|
||||
utils::conv::ConvParam params_;
|
||||
std::vector<index_t> out_spatial_lengths_;
|
||||
|
||||
const InDataType* p_in_;
|
||||
@@ -97,46 +108,28 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
|
||||
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
const auto naive_conv3d_fwd =
|
||||
ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>;
|
||||
using InLayout = ck::tensor_layout::convolution::GNCDHW;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKCZYX;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNKDHW;
|
||||
|
||||
float ave_time = launch_and_time_kernel(stream_config,
|
||||
naive_conv3d_fwd,
|
||||
dim3(256),
|
||||
dim3(256),
|
||||
0,
|
||||
arg.p_in_,
|
||||
arg.p_wei_,
|
||||
arg.p_out_,
|
||||
arg.N_,
|
||||
arg.K_,
|
||||
arg.C_,
|
||||
arg.in_spatial_lengths_[0],
|
||||
arg.in_spatial_lengths_[1],
|
||||
arg.in_spatial_lengths_[2],
|
||||
arg.filter_spatial_lengths_[0],
|
||||
arg.filter_spatial_lengths_[1],
|
||||
arg.filter_spatial_lengths_[2],
|
||||
arg.out_spatial_lengths_[0],
|
||||
arg.out_spatial_lengths_[1],
|
||||
arg.out_spatial_lengths_[2],
|
||||
arg.conv_filter_strides_[0],
|
||||
arg.conv_filter_strides_[1],
|
||||
arg.conv_filter_strides_[2],
|
||||
arg.conv_filter_dilations_[0],
|
||||
arg.conv_filter_dilations_[1],
|
||||
arg.conv_filter_dilations_[2],
|
||||
arg.in_left_pads_[0],
|
||||
arg.in_left_pads_[1],
|
||||
arg.in_left_pads_[2]);
|
||||
|
||||
return ave_time;
|
||||
// Use simplified ConvParam-based API
|
||||
ref::naive_conv_fwd<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>(arg.p_in_,
|
||||
arg.p_wei_,
|
||||
arg.p_out_,
|
||||
arg.params_,
|
||||
arg.in_element_op_,
|
||||
arg.wei_element_op_,
|
||||
arg.out_element_op_,
|
||||
stream_config.stream_id_);
|
||||
return 0; // No timing for naive implementation
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
@@ -155,7 +148,9 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
|
||||
|
||||
static bool IsSupportedArgument(const Argument& arg)
|
||||
{
|
||||
std::vector<index_t> out_spatial_lengths = arg.params_.GetOutputSpatialLengths();
|
||||
auto out_spatial_lengths_long = arg.params_.GetOutputSpatialLengths();
|
||||
std::vector<index_t> out_spatial_lengths(out_spatial_lengths_long.begin(),
|
||||
out_spatial_lengths_long.end());
|
||||
|
||||
bool out_lengths_are_consistent = out_spatial_lengths[0] == arg.out_spatial_lengths_[0] &&
|
||||
out_spatial_lengths[1] == arg.out_spatial_lengths_[1] &&
|
||||
|
||||
@@ -1,73 +0,0 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#ifndef CONV_COMMON_HPP
|
||||
#define CONV_COMMON_HPP
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace ref {
|
||||
|
||||
// Device-compatible dimension structure for GPU reference kernels
|
||||
// Replaces passing 24 individual parameters
|
||||
struct ConvDims
|
||||
{
|
||||
index_t N, K, C;
|
||||
index_t Di, Hi, Wi;
|
||||
index_t Z, Y, X;
|
||||
index_t Do, Ho, Wo;
|
||||
index_t stride_z, stride_y, stride_x;
|
||||
index_t dilation_z, dilation_y, dilation_x;
|
||||
index_t pad_z, pad_y, pad_x;
|
||||
};
|
||||
|
||||
} // namespace ref
|
||||
|
||||
// Helper function to extract dimensions from ConvParam for GPU kernels
|
||||
// Defined in ck::utils::conv namespace for convenience
|
||||
namespace utils {
|
||||
namespace conv {
|
||||
|
||||
inline ck::ref::ConvDims
|
||||
extract_conv_dims(const ConvParam& conv_param, ck::index_t NDimSpatial, bool apply_group = true)
|
||||
{
|
||||
ck::ref::ConvDims dims;
|
||||
dims.N = conv_param.N_;
|
||||
dims.K = conv_param.K_;
|
||||
dims.C = apply_group ? (conv_param.C_ * conv_param.G_) : conv_param.C_;
|
||||
|
||||
dims.Di = (NDimSpatial >= 3) ? conv_param.input_spatial_lengths_[0] : 1;
|
||||
dims.Hi = (NDimSpatial >= 2) ? conv_param.input_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
|
||||
dims.Wi = conv_param.input_spatial_lengths_[NDimSpatial - 1];
|
||||
|
||||
dims.Z = (NDimSpatial >= 3) ? conv_param.filter_spatial_lengths_[0] : 1;
|
||||
dims.Y = (NDimSpatial >= 2) ? conv_param.filter_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
|
||||
dims.X = conv_param.filter_spatial_lengths_[NDimSpatial - 1];
|
||||
|
||||
dims.Do = (NDimSpatial >= 3) ? conv_param.output_spatial_lengths_[0] : 1;
|
||||
dims.Ho = (NDimSpatial >= 2) ? conv_param.output_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
|
||||
dims.Wo = conv_param.output_spatial_lengths_[NDimSpatial - 1];
|
||||
|
||||
dims.stride_z = (NDimSpatial >= 3) ? conv_param.conv_filter_strides_[0] : 1;
|
||||
dims.stride_y =
|
||||
(NDimSpatial >= 2) ? conv_param.conv_filter_strides_[NDimSpatial >= 3 ? 1 : 0] : 1;
|
||||
dims.stride_x = conv_param.conv_filter_strides_[NDimSpatial - 1];
|
||||
|
||||
dims.dilation_z = (NDimSpatial >= 3) ? conv_param.conv_filter_dilations_[0] : 1;
|
||||
dims.dilation_y =
|
||||
(NDimSpatial >= 2) ? conv_param.conv_filter_dilations_[NDimSpatial >= 3 ? 1 : 0] : 1;
|
||||
dims.dilation_x = conv_param.conv_filter_dilations_[NDimSpatial - 1];
|
||||
|
||||
dims.pad_z = (NDimSpatial >= 3) ? conv_param.input_left_pads_[0] : 0;
|
||||
dims.pad_y = (NDimSpatial >= 2) ? conv_param.input_left_pads_[NDimSpatial >= 3 ? 1 : 0] : 0;
|
||||
dims.pad_x = conv_param.input_left_pads_[NDimSpatial - 1];
|
||||
|
||||
return dims;
|
||||
}
|
||||
|
||||
} // namespace conv
|
||||
} // namespace utils
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -4,146 +4,515 @@
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/type_convert.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
|
||||
#include "ck/host_utility/hip_check_error.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace ref {
|
||||
|
||||
/*
|
||||
* \brief naive implementation of 3D convolution backward data.
|
||||
* Layout is (NDHWC, KZYXC, NDHWK).
|
||||
* Computes gradient with respect to input.
|
||||
*
|
||||
* \param N number of batches
|
||||
* \param K number of filters (output channels)
|
||||
* \param C number of input channels
|
||||
* \param (Di, Hi, Wi) depth, height and width dimension of input
|
||||
* \param (Z, Y, X) depth, height and width dimensions of filter
|
||||
* \param (Do, Ho, Wo) depth, height and width dimension of output
|
||||
* \param (stride_z, stride_y, stride_x) strides
|
||||
* \param (dilation_z, dilation_y, dilation_x) dilations
|
||||
* \param (pad_z, pad_y, pad_x) pads
|
||||
*/
|
||||
template <typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename TAcc,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation>
|
||||
__global__ void naive_conv_bwd_data_ndhwc_kzyxc_ndhwk(TIn* __restrict__ p_in_grad,
|
||||
const TWei* __restrict__ p_wei,
|
||||
const TOut* __restrict__ p_out_grad,
|
||||
const ConvDims dims)
|
||||
// Optimized backward data convolution kernel working with packed (contiguous) tensors
|
||||
// Computes gradients w.r.t. input from output gradients and weights
|
||||
// Assumes row-major packing: input[G][N][C][spatial], weight[G][K][C][filter],
|
||||
// output[G][N][K][spatial]
|
||||
template <index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp>
|
||||
__global__ void naive_conv_bwd_data_packed(InDataType* __restrict__ p_in,
|
||||
const WeiDataType* __restrict__ p_wei,
|
||||
const OutDataType* __restrict__ p_out,
|
||||
index_t G,
|
||||
index_t N,
|
||||
index_t K,
|
||||
index_t C,
|
||||
index_t Di,
|
||||
index_t Hi,
|
||||
index_t Wi,
|
||||
index_t Z,
|
||||
index_t Y,
|
||||
index_t X,
|
||||
index_t Do,
|
||||
index_t Ho,
|
||||
index_t Wo,
|
||||
index_t stride_z,
|
||||
index_t stride_y,
|
||||
index_t stride_x,
|
||||
index_t dilation_z,
|
||||
index_t dilation_y,
|
||||
index_t dilation_x,
|
||||
index_t pad_z,
|
||||
index_t pad_y,
|
||||
index_t pad_x,
|
||||
InElementOp in_op,
|
||||
WeiElementOp wei_op,
|
||||
OutElementOp out_op)
|
||||
{
|
||||
const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const index_t num_threads = blockDim.x * gridDim.x;
|
||||
const long_index_t input_length = dims.N * dims.Di * dims.Hi * dims.Wi * dims.C;
|
||||
const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const long_index_t num_threads = blockDim.x * gridDim.x;
|
||||
|
||||
const index_t in_strides[] = {
|
||||
dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C};
|
||||
const index_t out_strides[] = {
|
||||
dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K};
|
||||
const index_t wei_strides[] = {
|
||||
dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C};
|
||||
InDataType in_val = InDataType{0};
|
||||
WeiDataType wei_val = WeiDataType{0};
|
||||
OutDataType out_val = OutDataType{0};
|
||||
|
||||
constexpr auto in_op = InElementwiseOperation{};
|
||||
constexpr auto wei_op = WeiElementwiseOperation{};
|
||||
constexpr auto out_op = OutElementwiseOperation{};
|
||||
|
||||
TIn in_val = TIn{0};
|
||||
TWei wei_val = TWei{0};
|
||||
TOut out_val = TOut{0};
|
||||
|
||||
for(long_index_t ii = tid; ii < input_length; ii += num_threads)
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
// Decode linear index to (n, di, hi, wi, c)
|
||||
const index_t n = ii / in_strides[0];
|
||||
index_t tmp = ii - n * in_strides[0];
|
||||
const index_t di = tmp / in_strides[1];
|
||||
tmp -= di * in_strides[1];
|
||||
const index_t hi = tmp / in_strides[2];
|
||||
tmp -= hi * in_strides[2];
|
||||
const index_t wi = tmp / in_strides[3];
|
||||
tmp -= wi * in_strides[3];
|
||||
const index_t c = tmp;
|
||||
const long_index_t num_in = G * N * C * Wi;
|
||||
const long_index_t out_stride_g = N * K * Wo;
|
||||
const long_index_t out_stride_n = K * Wo;
|
||||
const long_index_t out_stride_k = Wo;
|
||||
const long_index_t wei_stride_g = K * C * X;
|
||||
const long_index_t wei_stride_k = C * X;
|
||||
const long_index_t wei_stride_c = X;
|
||||
const long_index_t in_stride_g = N * C * Wi;
|
||||
const long_index_t in_stride_n = C * Wi;
|
||||
const long_index_t in_stride_c = Wi;
|
||||
|
||||
// Always accumulate in float
|
||||
float acc_float = 0.0f;
|
||||
|
||||
const TOut* out_n = p_out_grad + static_cast<long_index_t>(n) * out_strides[0];
|
||||
|
||||
// Loop over output channels
|
||||
for(index_t k = 0; k < dims.K; ++k)
|
||||
for(long_index_t idx = tid; idx < num_in; idx += num_threads)
|
||||
{
|
||||
const TWei* wei_k = p_wei + static_cast<long_index_t>(k) * wei_strides[0];
|
||||
index_t remaining = idx;
|
||||
const index_t wi = remaining % Wi;
|
||||
remaining /= Wi;
|
||||
const index_t c = remaining % C;
|
||||
remaining /= C;
|
||||
const index_t n = remaining % N;
|
||||
const index_t g = remaining / N;
|
||||
|
||||
// Loop over filter dimensions
|
||||
for(index_t z = 0; z < dims.Z; ++z)
|
||||
float acc = 0.0f;
|
||||
const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
|
||||
const WeiDataType* wei_g = p_wei + g * wei_stride_g;
|
||||
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
// Calculate output position from input position (inverse of forward)
|
||||
index_t d_tmp = di + dims.pad_z - z * dims.dilation_z;
|
||||
if(d_tmp % dims.stride_z != 0)
|
||||
continue;
|
||||
index_t d_o = d_tmp / dims.stride_z;
|
||||
if(d_o < 0 || d_o >= dims.Do)
|
||||
continue;
|
||||
|
||||
const TOut* out_n_do = out_n + d_o * out_strides[1];
|
||||
const TWei* wei_k_z = wei_k + z * wei_strides[1];
|
||||
|
||||
for(index_t y = 0; y < dims.Y; ++y)
|
||||
long_index_t w_tmp = wi + pad_x - x * dilation_x;
|
||||
if(w_tmp % stride_x == 0)
|
||||
{
|
||||
index_t h_tmp = hi + dims.pad_y - y * dims.dilation_y;
|
||||
if(h_tmp % dims.stride_y != 0)
|
||||
continue;
|
||||
index_t ho = h_tmp / dims.stride_y;
|
||||
if(ho < 0 || ho >= dims.Ho)
|
||||
continue;
|
||||
|
||||
const TOut* out_n_do_ho = out_n_do + ho * out_strides[2];
|
||||
const TWei* wei_k_z_y = wei_k_z + y * wei_strides[2];
|
||||
|
||||
for(index_t x = 0; x < dims.X; ++x)
|
||||
long_index_t wo = w_tmp / stride_x;
|
||||
if(wo >= 0 && wo < Wo)
|
||||
{
|
||||
index_t w_tmp = wi + dims.pad_x - x * dims.dilation_x;
|
||||
if(w_tmp % dims.stride_x != 0)
|
||||
continue;
|
||||
index_t wo = w_tmp / dims.stride_x;
|
||||
if(wo < 0 || wo >= dims.Wo)
|
||||
continue;
|
||||
const OutDataType* out_gnk = out_gn;
|
||||
const WeiDataType* wei_gkc = wei_g + c * wei_stride_c;
|
||||
|
||||
const TOut* out_n_do_ho_wo = out_n_do_ho + wo * out_strides[3];
|
||||
const TWei* wei_k_z_y_x = wei_k_z_y + x * wei_strides[3];
|
||||
|
||||
// Load values from memory
|
||||
TOut out_loaded = out_n_do_ho_wo[k];
|
||||
TWei wei_loaded = wei_k_z_y_x[c];
|
||||
|
||||
// Apply element-wise operations (like forward does)
|
||||
out_op(out_val, out_loaded);
|
||||
wei_op(wei_val, wei_loaded);
|
||||
|
||||
// Convert to float for multiplication
|
||||
float out_f = type_convert<float>(out_val);
|
||||
float wei_f = type_convert<float>(wei_val);
|
||||
|
||||
acc_float += out_f * wei_f;
|
||||
for(index_t k = 0; k < K; ++k)
|
||||
{
|
||||
out_op(out_val, out_gnk[k * out_stride_k + wo]);
|
||||
wei_op(wei_val, wei_gkc[k * wei_stride_k + x]);
|
||||
acc += type_convert<float>(out_val) * type_convert<float>(wei_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
InDataType result = type_convert<InDataType>(acc);
|
||||
in_op(in_val, result);
|
||||
p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + wi] = in_val;
|
||||
}
|
||||
}
|
||||
else if constexpr(NDimSpatial == 2)
|
||||
{
|
||||
const long_index_t num_in = G * N * C * Hi * Wi;
|
||||
const long_index_t out_stride_g = N * K * Ho * Wo;
|
||||
const long_index_t out_stride_n = K * Ho * Wo;
|
||||
const long_index_t out_stride_k = Ho * Wo;
|
||||
const long_index_t out_stride_h = Wo;
|
||||
const long_index_t wei_stride_g = K * C * Y * X;
|
||||
const long_index_t wei_stride_k = C * Y * X;
|
||||
const long_index_t wei_stride_c = Y * X;
|
||||
const long_index_t wei_stride_y = X;
|
||||
const long_index_t in_stride_g = N * C * Hi * Wi;
|
||||
const long_index_t in_stride_n = C * Hi * Wi;
|
||||
const long_index_t in_stride_c = Hi * Wi;
|
||||
const long_index_t in_stride_h = Wi;
|
||||
|
||||
// Convert float accumulator to TAcc, then to input type
|
||||
TAcc acc = type_convert<TAcc>(acc_float);
|
||||
TIn result = type_convert<TIn>(acc);
|
||||
for(long_index_t idx = tid; idx < num_in; idx += num_threads)
|
||||
{
|
||||
index_t remaining = idx;
|
||||
const index_t wi = remaining % Wi;
|
||||
remaining /= Wi;
|
||||
const index_t hi = remaining % Hi;
|
||||
remaining /= Hi;
|
||||
const index_t c = remaining % C;
|
||||
remaining /= C;
|
||||
const index_t n = remaining % N;
|
||||
const index_t g = remaining / N;
|
||||
|
||||
// Apply input element-wise operation (if any)
|
||||
in_op(in_val, result);
|
||||
float acc = 0.0f;
|
||||
const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
|
||||
const WeiDataType* wei_g = p_wei + g * wei_stride_g;
|
||||
|
||||
// Write transformed result
|
||||
p_in_grad[ii] = in_val;
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
{
|
||||
long_index_t h_tmp = hi + pad_y - y * dilation_y;
|
||||
if(h_tmp % stride_y == 0)
|
||||
{
|
||||
long_index_t ho = h_tmp / stride_y;
|
||||
if(ho >= 0 && ho < Ho)
|
||||
{
|
||||
const OutDataType* out_gnkh = out_gn + ho * out_stride_h;
|
||||
const WeiDataType* wei_gkcy = wei_g + c * wei_stride_c + y * wei_stride_y;
|
||||
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
long_index_t w_tmp = wi + pad_x - x * dilation_x;
|
||||
if(w_tmp % stride_x == 0)
|
||||
{
|
||||
long_index_t wo = w_tmp / stride_x;
|
||||
if(wo >= 0 && wo < Wo)
|
||||
{
|
||||
for(index_t k = 0; k < K; ++k)
|
||||
{
|
||||
out_op(out_val, out_gnkh[k * out_stride_k + wo]);
|
||||
wei_op(wei_val, wei_gkcy[k * wei_stride_k + x]);
|
||||
acc += type_convert<float>(out_val) *
|
||||
type_convert<float>(wei_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
InDataType result = type_convert<InDataType>(acc);
|
||||
in_op(in_val, result);
|
||||
p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + hi * in_stride_h + wi] =
|
||||
in_val;
|
||||
}
|
||||
}
|
||||
else if constexpr(NDimSpatial == 3)
|
||||
{
|
||||
const long_index_t num_in = G * N * C * Di * Hi * Wi;
|
||||
const long_index_t out_stride_g = N * K * Do * Ho * Wo;
|
||||
const long_index_t out_stride_n = K * Do * Ho * Wo;
|
||||
const long_index_t out_stride_k = Do * Ho * Wo;
|
||||
const long_index_t out_stride_d = Ho * Wo;
|
||||
const long_index_t out_stride_h = Wo;
|
||||
const long_index_t wei_stride_g = K * C * Z * Y * X;
|
||||
const long_index_t wei_stride_k = C * Z * Y * X;
|
||||
const long_index_t wei_stride_c = Z * Y * X;
|
||||
const long_index_t wei_stride_z = Y * X;
|
||||
const long_index_t wei_stride_y = X;
|
||||
const long_index_t in_stride_g = N * C * Di * Hi * Wi;
|
||||
const long_index_t in_stride_n = C * Di * Hi * Wi;
|
||||
const long_index_t in_stride_c = Di * Hi * Wi;
|
||||
const long_index_t in_stride_d = Hi * Wi;
|
||||
const long_index_t in_stride_h = Wi;
|
||||
|
||||
for(long_index_t idx = tid; idx < num_in; idx += num_threads)
|
||||
{
|
||||
index_t remaining = idx;
|
||||
const index_t wi = remaining % Wi;
|
||||
remaining /= Wi;
|
||||
const index_t hi = remaining % Hi;
|
||||
remaining /= Hi;
|
||||
const index_t di = remaining % Di;
|
||||
remaining /= Di;
|
||||
const index_t c = remaining % C;
|
||||
remaining /= C;
|
||||
const index_t n = remaining % N;
|
||||
const index_t g = remaining / N;
|
||||
|
||||
float acc = 0.0f;
|
||||
const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
|
||||
const WeiDataType* wei_g = p_wei + g * wei_stride_g;
|
||||
|
||||
for(index_t z = 0; z < Z; ++z)
|
||||
{
|
||||
long_index_t d_tmp = di + pad_z - z * dilation_z;
|
||||
if(d_tmp % stride_z == 0)
|
||||
{
|
||||
long_index_t do_idx = d_tmp / stride_z;
|
||||
if(do_idx >= 0 && do_idx < Do)
|
||||
{
|
||||
const OutDataType* out_gnkd = out_gn + do_idx * out_stride_d;
|
||||
const WeiDataType* wei_gkcz = wei_g + c * wei_stride_c + z * wei_stride_z;
|
||||
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
{
|
||||
long_index_t h_tmp = hi + pad_y - y * dilation_y;
|
||||
if(h_tmp % stride_y == 0)
|
||||
{
|
||||
long_index_t ho = h_tmp / stride_y;
|
||||
if(ho >= 0 && ho < Ho)
|
||||
{
|
||||
const OutDataType* out_gnkdh = out_gnkd + ho * out_stride_h;
|
||||
const WeiDataType* wei_gkczy = wei_gkcz + y * wei_stride_y;
|
||||
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
long_index_t w_tmp = wi + pad_x - x * dilation_x;
|
||||
if(w_tmp % stride_x == 0)
|
||||
{
|
||||
long_index_t wo = w_tmp / stride_x;
|
||||
if(wo >= 0 && wo < Wo)
|
||||
{
|
||||
for(index_t k = 0; k < K; ++k)
|
||||
{
|
||||
out_op(out_val,
|
||||
out_gnkdh[k * out_stride_k + wo]);
|
||||
wei_op(wei_val,
|
||||
wei_gkczy[k * wei_stride_k + x]);
|
||||
acc += type_convert<float>(out_val) *
|
||||
type_convert<float>(wei_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
InDataType result = type_convert<InDataType>(acc);
|
||||
in_op(in_val, result);
|
||||
p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + di * in_stride_d +
|
||||
hi * in_stride_h + wi] = in_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GPU reference backward data convolution - takes ConvParam directly
|
||||
template <typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation>
|
||||
void naive_conv_bwd_data(TIn* p_in,
|
||||
const TWei* p_wei,
|
||||
const TOut* p_out,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
InElementwiseOperation in_element_op = InElementwiseOperation{},
|
||||
WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{},
|
||||
OutElementwiseOperation out_element_op = OutElementwiseOperation{},
|
||||
hipStream_t stream = nullptr)
|
||||
{
|
||||
const auto ndim = conv_param.num_dim_spatial_;
|
||||
|
||||
const index_t G = conv_param.G_;
|
||||
const index_t N = conv_param.N_;
|
||||
const index_t C = conv_param.C_;
|
||||
const index_t K = conv_param.K_;
|
||||
|
||||
std::vector<index_t> in_lengths = {G, N, C};
|
||||
std::vector<index_t> wei_lengths = {G, K, C};
|
||||
std::vector<index_t> out_lengths = {G, N, K};
|
||||
|
||||
for(index_t i = 0; i < ndim; ++i)
|
||||
{
|
||||
in_lengths.push_back(static_cast<index_t>(conv_param.input_spatial_lengths_[i]));
|
||||
wei_lengths.push_back(static_cast<index_t>(conv_param.filter_spatial_lengths_[i]));
|
||||
out_lengths.push_back(static_cast<index_t>(conv_param.output_spatial_lengths_[i]));
|
||||
}
|
||||
|
||||
// Calculate total elements for buffer allocation
|
||||
long_index_t in_total = 1, wei_total = 1, out_total = 1;
|
||||
for(auto l : in_lengths)
|
||||
in_total *= l;
|
||||
for(auto l : wei_lengths)
|
||||
wei_total *= l;
|
||||
for(auto l : out_lengths)
|
||||
out_total *= l;
|
||||
|
||||
// Allocate packed buffers
|
||||
SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn));
|
||||
SimpleDeviceMem wei_packed_buf(wei_total * sizeof(TWei));
|
||||
SimpleDeviceMem out_packed_buf(out_total * sizeof(TOut));
|
||||
|
||||
TIn* p_in_packed = static_cast<TIn*>(in_packed_buf.GetDeviceBuffer());
|
||||
TWei* p_wei_packed = static_cast<TWei*>(wei_packed_buf.GetDeviceBuffer());
|
||||
TOut* p_out_packed = static_cast<TOut*>(out_packed_buf.GetDeviceBuffer());
|
||||
|
||||
// Compute strides and allocate device arrays for pack/unpack
|
||||
std::vector<index_t> in_strides = compute_conv_tensor_strides<InLayout>(in_lengths, ndim);
|
||||
std::vector<index_t> wei_strides = compute_conv_tensor_strides<WeiLayout>(wei_lengths, ndim);
|
||||
std::vector<index_t> out_strides = compute_conv_tensor_strides<OutLayout>(out_lengths, ndim);
|
||||
|
||||
const size_t dim_count = in_lengths.size();
|
||||
SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t));
|
||||
|
||||
index_t* d_in_lengths = static_cast<index_t*>(in_lengths_buf.GetDeviceBuffer());
|
||||
index_t* d_in_strides = static_cast<index_t*>(in_strides_buf.GetDeviceBuffer());
|
||||
index_t* d_wei_lengths = static_cast<index_t*>(wei_lengths_buf.GetDeviceBuffer());
|
||||
index_t* d_wei_strides = static_cast<index_t*>(wei_strides_buf.GetDeviceBuffer());
|
||||
index_t* d_out_lengths = static_cast<index_t*>(out_lengths_buf.GetDeviceBuffer());
|
||||
index_t* d_out_strides = static_cast<index_t*>(out_strides_buf.GetDeviceBuffer());
|
||||
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
|
||||
// Pack output and weight tensors to contiguous layout (inputs to bwd data)
|
||||
constexpr int block_size = 256;
|
||||
strided_copy_kernel<TOut, false>
|
||||
<<<(out_total + block_size - 1) / block_size, block_size, 0, stream>>>(
|
||||
p_out, p_out_packed, d_out_lengths, d_out_strides, dim_count, out_total);
|
||||
strided_copy_kernel<TWei, false>
|
||||
<<<(wei_total + block_size - 1) / block_size, block_size, 0, stream>>>(
|
||||
p_wei, p_wei_packed, d_wei_lengths, d_wei_strides, dim_count, wei_total);
|
||||
|
||||
// Build conv parameter vectors for kernel invocation
|
||||
std::vector<index_t> conv_strides(ndim);
|
||||
std::vector<index_t> conv_dilations(ndim);
|
||||
std::vector<index_t> input_pads(ndim);
|
||||
for(index_t i = 0; i < ndim; ++i)
|
||||
{
|
||||
conv_strides[i] = static_cast<index_t>(conv_param.conv_filter_strides_[i]);
|
||||
conv_dilations[i] = static_cast<index_t>(conv_param.conv_filter_dilations_[i]);
|
||||
input_pads[i] = static_cast<index_t>(conv_param.input_left_pads_[i]);
|
||||
}
|
||||
|
||||
// Run backward data convolution kernel on packed data
|
||||
const int in_grid = (in_total + block_size - 1) / block_size;
|
||||
|
||||
if(ndim == 1)
|
||||
{
|
||||
naive_conv_bwd_data_packed<1,
|
||||
TIn,
|
||||
TWei,
|
||||
TOut,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
<<<in_grid, block_size, 0, stream>>>(p_in_packed,
|
||||
p_wei_packed,
|
||||
p_out_packed,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
1,
|
||||
1,
|
||||
in_lengths[3],
|
||||
1,
|
||||
1,
|
||||
wei_lengths[3],
|
||||
1,
|
||||
1,
|
||||
out_lengths[3],
|
||||
1,
|
||||
1,
|
||||
conv_strides[0],
|
||||
1,
|
||||
1,
|
||||
conv_dilations[0],
|
||||
0,
|
||||
0,
|
||||
input_pads[0],
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(ndim == 2)
|
||||
{
|
||||
naive_conv_bwd_data_packed<2,
|
||||
TIn,
|
||||
TWei,
|
||||
TOut,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
<<<in_grid, block_size, 0, stream>>>(p_in_packed,
|
||||
p_wei_packed,
|
||||
p_out_packed,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
1,
|
||||
in_lengths[3],
|
||||
in_lengths[4],
|
||||
1,
|
||||
wei_lengths[3],
|
||||
wei_lengths[4],
|
||||
1,
|
||||
out_lengths[3],
|
||||
out_lengths[4],
|
||||
1,
|
||||
conv_strides[0],
|
||||
conv_strides[1],
|
||||
1,
|
||||
conv_dilations[0],
|
||||
conv_dilations[1],
|
||||
0,
|
||||
input_pads[0],
|
||||
input_pads[1],
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else // 3D
|
||||
{
|
||||
naive_conv_bwd_data_packed<3,
|
||||
TIn,
|
||||
TWei,
|
||||
TOut,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
<<<in_grid, block_size, 0, stream>>>(p_in_packed,
|
||||
p_wei_packed,
|
||||
p_out_packed,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
in_lengths[3],
|
||||
in_lengths[4],
|
||||
in_lengths[5],
|
||||
wei_lengths[3],
|
||||
wei_lengths[4],
|
||||
wei_lengths[5],
|
||||
out_lengths[3],
|
||||
out_lengths[4],
|
||||
out_lengths[5],
|
||||
conv_strides[0],
|
||||
conv_strides[1],
|
||||
conv_strides[2],
|
||||
conv_dilations[0],
|
||||
conv_dilations[1],
|
||||
conv_dilations[2],
|
||||
input_pads[0],
|
||||
input_pads[1],
|
||||
input_pads[2],
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
|
||||
// Unpack result back to strided layout
|
||||
strided_copy_kernel<TIn, true><<<in_grid, block_size, 0, stream>>>(
|
||||
p_in_packed, p_in, d_in_lengths, d_in_strides, dim_count, in_total);
|
||||
|
||||
HIP_CHECK_ERROR(hipGetLastError());
|
||||
|
||||
// Memory automatically freed by SimpleDeviceMem destructors
|
||||
}
|
||||
|
||||
} // namespace ref
|
||||
} // namespace ck
|
||||
|
||||
@@ -4,136 +4,497 @@
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/type_convert.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
|
||||
#include "ck/host_utility/hip_check_error.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace ref {
|
||||
|
||||
/*
|
||||
* \brief naive implementation of 3D convolution backward weight.
|
||||
* Layout is (NDHWC, KZYXC, NDHWK).
|
||||
* Computes gradient with respect to weights.
|
||||
*
|
||||
* \param N number of batches
|
||||
* \param K number of filters (output channels)
|
||||
* \param C number of input channels
|
||||
* \param (Di, Hi, Wi) depth, height and width dimension of input
|
||||
* \param (Z, Y, X) depth, height and width dimensions of filter
|
||||
* \param (Do, Ho, Wo) depth, height and width dimension of output
|
||||
* \param (stride_z, stride_y, stride_x) strides
|
||||
* \param (dilation_z, dilation_y, dilation_x) dilations
|
||||
* \param (pad_z, pad_y, pad_x) pads
|
||||
*/
|
||||
template <typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename TAcc,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation>
|
||||
__global__ void naive_conv_bwd_weight_ndhwc_kzyxc_ndhwk(const TIn* __restrict__ p_in,
|
||||
TWei* __restrict__ p_wei_grad,
|
||||
const TOut* __restrict__ p_out_grad,
|
||||
const ConvDims dims)
|
||||
// Optimized backward weight convolution kernel working with packed (contiguous) tensors
|
||||
// Assumes row-major packing: input[G][N][C][spatial], output_grad[G][N][K][spatial],
|
||||
// weight_grad[G][K][C][filter]
|
||||
// Computes gradient with respect to weights
|
||||
template <index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp>
|
||||
__global__ void naive_conv_bwd_weight_packed(const InDataType* __restrict__ p_in,
|
||||
WeiDataType* __restrict__ p_wei_grad,
|
||||
const OutDataType* __restrict__ p_out_grad,
|
||||
index_t G,
|
||||
index_t N,
|
||||
index_t K,
|
||||
index_t C,
|
||||
index_t Di,
|
||||
index_t Hi,
|
||||
index_t Wi,
|
||||
index_t Z,
|
||||
index_t Y,
|
||||
index_t X,
|
||||
index_t Do,
|
||||
index_t Ho,
|
||||
index_t Wo,
|
||||
index_t stride_z,
|
||||
index_t stride_y,
|
||||
index_t stride_x,
|
||||
index_t dilation_z,
|
||||
index_t dilation_y,
|
||||
index_t dilation_x,
|
||||
index_t pad_z,
|
||||
index_t pad_y,
|
||||
index_t pad_x,
|
||||
InElementOp in_op,
|
||||
WeiElementOp wei_op,
|
||||
OutElementOp out_op)
|
||||
{
|
||||
const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const index_t num_threads = blockDim.x * gridDim.x;
|
||||
const long_index_t weight_length = dims.K * dims.Z * dims.Y * dims.X * dims.C;
|
||||
const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const long_index_t num_threads = blockDim.x * gridDim.x;
|
||||
|
||||
const index_t in_strides[] = {
|
||||
dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C};
|
||||
const index_t out_strides[] = {
|
||||
dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K};
|
||||
const index_t wei_strides[] = {
|
||||
dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C};
|
||||
InDataType in_val = InDataType{0};
|
||||
WeiDataType wei_val = WeiDataType{0};
|
||||
OutDataType out_val = OutDataType{0};
|
||||
|
||||
constexpr auto in_op = InElementwiseOperation{};
|
||||
constexpr auto wei_op = WeiElementwiseOperation{};
|
||||
constexpr auto out_op = OutElementwiseOperation{};
|
||||
|
||||
TIn in_val = TIn{0};
|
||||
TWei wei_val = TWei{0};
|
||||
TOut out_val = TOut{0};
|
||||
|
||||
for(long_index_t ii = tid; ii < weight_length; ii += num_threads)
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
// Decode linear index to (k, z, y, x, c)
|
||||
const index_t k = ii / wei_strides[0];
|
||||
index_t tmp = ii - k * wei_strides[0];
|
||||
const index_t z = tmp / wei_strides[1];
|
||||
tmp -= z * wei_strides[1];
|
||||
const index_t y = tmp / wei_strides[2];
|
||||
tmp -= y * wei_strides[2];
|
||||
const index_t x = tmp / wei_strides[3];
|
||||
tmp -= x * wei_strides[3];
|
||||
const index_t c = tmp;
|
||||
const long_index_t num_wei = G * K * C * X;
|
||||
const long_index_t in_stride_g = N * C * Wi;
|
||||
const long_index_t in_stride_n = C * Wi;
|
||||
const long_index_t in_stride_c = Wi;
|
||||
const long_index_t out_stride_g = N * K * Wo;
|
||||
const long_index_t out_stride_n = K * Wo;
|
||||
const long_index_t out_stride_k = Wo;
|
||||
const long_index_t wei_stride_g = K * C * X;
|
||||
const long_index_t wei_stride_k = C * X;
|
||||
const long_index_t wei_stride_c = X;
|
||||
|
||||
// Always accumulate in float
|
||||
float acc_float = 0.0f;
|
||||
|
||||
// Loop over batch
|
||||
for(index_t n = 0; n < dims.N; ++n)
|
||||
for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
|
||||
{
|
||||
const TIn* in_n = p_in + static_cast<long_index_t>(n) * in_strides[0];
|
||||
const TOut* out_n = p_out_grad + static_cast<long_index_t>(n) * out_strides[0];
|
||||
index_t remaining = idx;
|
||||
const index_t x = remaining % X;
|
||||
remaining /= X;
|
||||
const index_t c = remaining % C;
|
||||
remaining /= C;
|
||||
const index_t k = remaining % K;
|
||||
const index_t g = remaining / K;
|
||||
|
||||
// Loop over output spatial dimensions
|
||||
for(index_t d_o = 0; d_o < dims.Do; ++d_o)
|
||||
float acc = 0.0f;
|
||||
const InDataType* in_g = p_in + g * in_stride_g;
|
||||
const OutDataType* out_grad = p_out_grad + g * out_stride_g;
|
||||
|
||||
// Loop over batch and output positions
|
||||
for(index_t n = 0; n < N; ++n)
|
||||
{
|
||||
// Calculate input position from output position
|
||||
index_t di = d_o * dims.stride_z - dims.pad_z + z * dims.dilation_z;
|
||||
if(di < 0 || di >= dims.Di)
|
||||
continue;
|
||||
const InDataType* in_gn = in_g + n * in_stride_n + c * in_stride_c;
|
||||
const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
|
||||
|
||||
const TIn* in_n_di = in_n + di * in_strides[1];
|
||||
const TOut* out_n_do = out_n + d_o * out_strides[1];
|
||||
|
||||
for(index_t ho = 0; ho < dims.Ho; ++ho)
|
||||
for(index_t wo = 0; wo < Wo; ++wo)
|
||||
{
|
||||
index_t hi = ho * dims.stride_y - dims.pad_y + y * dims.dilation_y;
|
||||
if(hi < 0 || hi >= dims.Hi)
|
||||
continue;
|
||||
|
||||
const TIn* in_n_di_hi = in_n_di + hi * in_strides[2];
|
||||
const TOut* out_n_do_ho = out_n_do + ho * out_strides[2];
|
||||
|
||||
for(index_t wo = 0; wo < dims.Wo; ++wo)
|
||||
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
|
||||
if(wi >= 0 && wi < Wi)
|
||||
{
|
||||
index_t wi = wo * dims.stride_x - dims.pad_x + x * dims.dilation_x;
|
||||
if(wi < 0 || wi >= dims.Wi)
|
||||
continue;
|
||||
|
||||
// Load values from memory (like forward does)
|
||||
const TIn* in_ptr = in_n_di_hi + wi * in_strides[3];
|
||||
const TOut* out_ptr = out_n_do_ho + wo * out_strides[3];
|
||||
|
||||
TIn in_loaded = in_ptr[c];
|
||||
TOut out_loaded = out_ptr[k];
|
||||
|
||||
// Apply element-wise operations
|
||||
in_op(in_val, in_loaded);
|
||||
out_op(out_val, out_loaded);
|
||||
|
||||
// Convert to float for multiplication
|
||||
float in_f = type_convert<float>(in_val);
|
||||
float out_f = type_convert<float>(out_val);
|
||||
|
||||
acc_float += out_f * in_f;
|
||||
in_op(in_val, in_gn[wi]);
|
||||
out_op(out_val, out_gn_k[wo]);
|
||||
acc += type_convert<float>(out_val) * type_convert<float>(in_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
WeiDataType result = type_convert<WeiDataType>(acc);
|
||||
wei_op(wei_val, result);
|
||||
p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + x] = wei_val;
|
||||
}
|
||||
}
|
||||
else if constexpr(NDimSpatial == 2)
|
||||
{
|
||||
const long_index_t num_wei = G * K * C * Y * X;
|
||||
const long_index_t in_stride_g = N * C * Hi * Wi;
|
||||
const long_index_t in_stride_n = C * Hi * Wi;
|
||||
const long_index_t in_stride_c = Hi * Wi;
|
||||
const long_index_t in_stride_h = Wi;
|
||||
const long_index_t out_stride_g = N * K * Ho * Wo;
|
||||
const long_index_t out_stride_n = K * Ho * Wo;
|
||||
const long_index_t out_stride_k = Ho * Wo;
|
||||
const long_index_t out_stride_h = Wo;
|
||||
const long_index_t wei_stride_g = K * C * Y * X;
|
||||
const long_index_t wei_stride_k = C * Y * X;
|
||||
const long_index_t wei_stride_c = Y * X;
|
||||
const long_index_t wei_stride_y = X;
|
||||
|
||||
// Convert float accumulator to TAcc, then to weight type
|
||||
TAcc acc = type_convert<TAcc>(acc_float);
|
||||
TWei result = type_convert<TWei>(acc);
|
||||
for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
|
||||
{
|
||||
index_t remaining = idx;
|
||||
const index_t x = remaining % X;
|
||||
remaining /= X;
|
||||
const index_t y = remaining % Y;
|
||||
remaining /= Y;
|
||||
const index_t c = remaining % C;
|
||||
remaining /= C;
|
||||
const index_t k = remaining % K;
|
||||
const index_t g = remaining / K;
|
||||
|
||||
// Apply weight element-wise operation (if any)
|
||||
wei_op(wei_val, result);
|
||||
float acc = 0.0f;
|
||||
const InDataType* in_g = p_in + g * in_stride_g;
|
||||
const OutDataType* out_grad = p_out_grad + g * out_stride_g;
|
||||
|
||||
// Write transformed result
|
||||
p_wei_grad[ii] = wei_val;
|
||||
// Loop over batch and output positions
|
||||
for(index_t n = 0; n < N; ++n)
|
||||
{
|
||||
const InDataType* in_gnc = in_g + n * in_stride_n + c * in_stride_c;
|
||||
const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
|
||||
|
||||
for(index_t ho = 0; ho < Ho; ++ho)
|
||||
{
|
||||
long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
|
||||
if(hi >= 0 && hi < Hi)
|
||||
{
|
||||
const InDataType* in_gnch = in_gnc + hi * in_stride_h;
|
||||
const OutDataType* out_gn_kh = out_gn_k + ho * out_stride_h;
|
||||
|
||||
for(index_t wo = 0; wo < Wo; ++wo)
|
||||
{
|
||||
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
|
||||
if(wi >= 0 && wi < Wi)
|
||||
{
|
||||
in_op(in_val, in_gnch[wi]);
|
||||
out_op(out_val, out_gn_kh[wo]);
|
||||
acc += type_convert<float>(out_val) * type_convert<float>(in_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
WeiDataType result = type_convert<WeiDataType>(acc);
|
||||
wei_op(wei_val, result);
|
||||
p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + y * wei_stride_y +
|
||||
x] = wei_val;
|
||||
}
|
||||
}
|
||||
else if constexpr(NDimSpatial == 3)
|
||||
{
|
||||
const long_index_t num_wei = G * K * C * Z * Y * X;
|
||||
const long_index_t in_stride_g = N * C * Di * Hi * Wi;
|
||||
const long_index_t in_stride_n = C * Di * Hi * Wi;
|
||||
const long_index_t in_stride_c = Di * Hi * Wi;
|
||||
const long_index_t in_stride_d = Hi * Wi;
|
||||
const long_index_t in_stride_h = Wi;
|
||||
const long_index_t out_stride_g = N * K * Do * Ho * Wo;
|
||||
const long_index_t out_stride_n = K * Do * Ho * Wo;
|
||||
const long_index_t out_stride_k = Do * Ho * Wo;
|
||||
const long_index_t out_stride_d = Ho * Wo;
|
||||
const long_index_t out_stride_h = Wo;
|
||||
const long_index_t wei_stride_g = K * C * Z * Y * X;
|
||||
const long_index_t wei_stride_k = C * Z * Y * X;
|
||||
const long_index_t wei_stride_c = Z * Y * X;
|
||||
const long_index_t wei_stride_z = Y * X;
|
||||
const long_index_t wei_stride_y = X;
|
||||
|
||||
for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
|
||||
{
|
||||
index_t remaining = idx;
|
||||
const index_t x = remaining % X;
|
||||
remaining /= X;
|
||||
const index_t y = remaining % Y;
|
||||
remaining /= Y;
|
||||
const index_t z = remaining % Z;
|
||||
remaining /= Z;
|
||||
const index_t c = remaining % C;
|
||||
remaining /= C;
|
||||
const index_t k = remaining % K;
|
||||
const index_t g = remaining / K;
|
||||
|
||||
float acc = 0.0f;
|
||||
const InDataType* in_g = p_in + g * in_stride_g;
|
||||
const OutDataType* out_grad = p_out_grad + g * out_stride_g;
|
||||
|
||||
// Loop over batch and output positions
|
||||
for(index_t n = 0; n < N; ++n)
|
||||
{
|
||||
const InDataType* in_gnc = in_g + n * in_stride_n + c * in_stride_c;
|
||||
const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
|
||||
|
||||
for(index_t do_idx = 0; do_idx < Do; ++do_idx)
|
||||
{
|
||||
long_index_t di = do_idx * stride_z + z * dilation_z - pad_z;
|
||||
if(di >= 0 && di < Di)
|
||||
{
|
||||
const InDataType* in_gncd = in_gnc + di * in_stride_d;
|
||||
const OutDataType* out_gn_kd = out_gn_k + do_idx * out_stride_d;
|
||||
|
||||
for(index_t ho = 0; ho < Ho; ++ho)
|
||||
{
|
||||
long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
|
||||
if(hi >= 0 && hi < Hi)
|
||||
{
|
||||
const InDataType* in_gncdh = in_gncd + hi * in_stride_h;
|
||||
const OutDataType* out_gn_kdh = out_gn_kd + ho * out_stride_h;
|
||||
|
||||
for(index_t wo = 0; wo < Wo; ++wo)
|
||||
{
|
||||
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
|
||||
if(wi >= 0 && wi < Wi)
|
||||
{
|
||||
in_op(in_val, in_gncdh[wi]);
|
||||
out_op(out_val, out_gn_kdh[wo]);
|
||||
acc += type_convert<float>(out_val) *
|
||||
type_convert<float>(in_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
WeiDataType result = type_convert<WeiDataType>(acc);
|
||||
wei_op(wei_val, result);
|
||||
p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + z * wei_stride_z +
|
||||
y * wei_stride_y + x] = wei_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GPU reference backward weight convolution - takes ConvParam directly
|
||||
template <typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation>
|
||||
void naive_conv_bwd_weight(const TIn* p_in,
|
||||
TWei* p_wei_grad,
|
||||
const TOut* p_out,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
InElementwiseOperation in_element_op = InElementwiseOperation{},
|
||||
WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{},
|
||||
OutElementwiseOperation out_element_op = OutElementwiseOperation{},
|
||||
hipStream_t stream = nullptr)
|
||||
{
|
||||
const auto ndim = conv_param.num_dim_spatial_;
|
||||
|
||||
const index_t G = conv_param.G_;
|
||||
const index_t N = conv_param.N_;
|
||||
const index_t C = conv_param.C_;
|
||||
const index_t K = conv_param.K_;
|
||||
|
||||
std::vector<index_t> in_lengths = {G, N, C};
|
||||
std::vector<index_t> wei_lengths = {G, K, C};
|
||||
std::vector<index_t> out_lengths = {G, N, K};
|
||||
|
||||
for(index_t i = 0; i < ndim; ++i)
|
||||
{
|
||||
in_lengths.push_back(static_cast<index_t>(conv_param.input_spatial_lengths_[i]));
|
||||
wei_lengths.push_back(static_cast<index_t>(conv_param.filter_spatial_lengths_[i]));
|
||||
out_lengths.push_back(static_cast<index_t>(conv_param.output_spatial_lengths_[i]));
|
||||
}
|
||||
|
||||
// Calculate total elements for buffer allocation
|
||||
long_index_t in_total = 1, wei_total = 1, out_total = 1;
|
||||
for(auto l : in_lengths)
|
||||
in_total *= l;
|
||||
for(auto l : wei_lengths)
|
||||
wei_total *= l;
|
||||
for(auto l : out_lengths)
|
||||
out_total *= l;
|
||||
|
||||
// Allocate packed buffers
|
||||
SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn));
|
||||
SimpleDeviceMem wei_grad_packed_buf(wei_total * sizeof(TWei));
|
||||
SimpleDeviceMem out_grad_packed_buf(out_total * sizeof(TOut));
|
||||
|
||||
TIn* p_in_packed = static_cast<TIn*>(in_packed_buf.GetDeviceBuffer());
|
||||
TWei* p_wei_grad_packed = static_cast<TWei*>(wei_grad_packed_buf.GetDeviceBuffer());
|
||||
TOut* p_out_grad_packed = static_cast<TOut*>(out_grad_packed_buf.GetDeviceBuffer());
|
||||
|
||||
// Compute strides and allocate device arrays for pack/unpack
|
||||
std::vector<index_t> in_strides = compute_conv_tensor_strides<InLayout>(in_lengths, ndim);
|
||||
std::vector<index_t> wei_strides = compute_conv_tensor_strides<WeiLayout>(wei_lengths, ndim);
|
||||
std::vector<index_t> out_strides = compute_conv_tensor_strides<OutLayout>(out_lengths, ndim);
|
||||
|
||||
const size_t dim_count = in_lengths.size();
|
||||
SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t));
|
||||
|
||||
index_t* d_in_lengths = static_cast<index_t*>(in_lengths_buf.GetDeviceBuffer());
|
||||
index_t* d_in_strides = static_cast<index_t*>(in_strides_buf.GetDeviceBuffer());
|
||||
index_t* d_wei_lengths = static_cast<index_t*>(wei_lengths_buf.GetDeviceBuffer());
|
||||
index_t* d_wei_strides = static_cast<index_t*>(wei_strides_buf.GetDeviceBuffer());
|
||||
index_t* d_out_lengths = static_cast<index_t*>(out_lengths_buf.GetDeviceBuffer());
|
||||
index_t* d_out_strides = static_cast<index_t*>(out_strides_buf.GetDeviceBuffer());
|
||||
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
|
||||
// Pack input and output_grad tensors to contiguous layout (inputs to bwd weight)
|
||||
constexpr int block_size = 256;
|
||||
strided_copy_kernel<TIn, false>
|
||||
<<<(in_total + block_size - 1) / block_size, block_size, 0, stream>>>(
|
||||
p_in, p_in_packed, d_in_lengths, d_in_strides, dim_count, in_total);
|
||||
strided_copy_kernel<TOut, false>
|
||||
<<<(out_total + block_size - 1) / block_size, block_size, 0, stream>>>(
|
||||
p_out, p_out_grad_packed, d_out_lengths, d_out_strides, dim_count, out_total);
|
||||
|
||||
// Build conv parameter vectors for kernel invocation
|
||||
std::vector<index_t> conv_strides(ndim);
|
||||
std::vector<index_t> conv_dilations(ndim);
|
||||
std::vector<index_t> input_pads(ndim);
|
||||
for(index_t i = 0; i < ndim; ++i)
|
||||
{
|
||||
conv_strides[i] = static_cast<index_t>(conv_param.conv_filter_strides_[i]);
|
||||
conv_dilations[i] = static_cast<index_t>(conv_param.conv_filter_dilations_[i]);
|
||||
input_pads[i] = static_cast<index_t>(conv_param.input_left_pads_[i]);
|
||||
}
|
||||
|
||||
// Run backward weight convolution kernel on packed data
|
||||
const int wei_grid = (wei_total + block_size - 1) / block_size;
|
||||
|
||||
if(ndim == 1)
|
||||
{
|
||||
naive_conv_bwd_weight_packed<1,
|
||||
TIn,
|
||||
TWei,
|
||||
TOut,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
<<<wei_grid, block_size, 0, stream>>>(p_in_packed,
|
||||
p_wei_grad_packed,
|
||||
p_out_grad_packed,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
1,
|
||||
1,
|
||||
in_lengths[3],
|
||||
1,
|
||||
1,
|
||||
wei_lengths[3],
|
||||
1,
|
||||
1,
|
||||
out_lengths[3],
|
||||
1,
|
||||
1,
|
||||
conv_strides[0],
|
||||
1,
|
||||
1,
|
||||
conv_dilations[0],
|
||||
0,
|
||||
0,
|
||||
input_pads[0],
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(ndim == 2)
|
||||
{
|
||||
naive_conv_bwd_weight_packed<2,
|
||||
TIn,
|
||||
TWei,
|
||||
TOut,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
<<<wei_grid, block_size, 0, stream>>>(p_in_packed,
|
||||
p_wei_grad_packed,
|
||||
p_out_grad_packed,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
1,
|
||||
in_lengths[3],
|
||||
in_lengths[4],
|
||||
1,
|
||||
wei_lengths[3],
|
||||
wei_lengths[4],
|
||||
1,
|
||||
out_lengths[3],
|
||||
out_lengths[4],
|
||||
1,
|
||||
conv_strides[0],
|
||||
conv_strides[1],
|
||||
1,
|
||||
conv_dilations[0],
|
||||
conv_dilations[1],
|
||||
0,
|
||||
input_pads[0],
|
||||
input_pads[1],
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else // 3D
|
||||
{
|
||||
naive_conv_bwd_weight_packed<3,
|
||||
TIn,
|
||||
TWei,
|
||||
TOut,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
<<<wei_grid, block_size, 0, stream>>>(p_in_packed,
|
||||
p_wei_grad_packed,
|
||||
p_out_grad_packed,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
in_lengths[3],
|
||||
in_lengths[4],
|
||||
in_lengths[5],
|
||||
wei_lengths[3],
|
||||
wei_lengths[4],
|
||||
wei_lengths[5],
|
||||
out_lengths[3],
|
||||
out_lengths[4],
|
||||
out_lengths[5],
|
||||
conv_strides[0],
|
||||
conv_strides[1],
|
||||
conv_strides[2],
|
||||
conv_dilations[0],
|
||||
conv_dilations[1],
|
||||
conv_dilations[2],
|
||||
input_pads[0],
|
||||
input_pads[1],
|
||||
input_pads[2],
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
|
||||
// Unpack weight gradient
|
||||
strided_copy_kernel<TWei, true><<<wei_grid, block_size, 0, stream>>>(
|
||||
p_wei_grad_packed, p_wei_grad, d_wei_lengths, d_wei_strides, dim_count, wei_total);
|
||||
|
||||
HIP_CHECK_ERROR(hipGetLastError());
|
||||
|
||||
// Memory automatically freed by SimpleDeviceMem destructors
|
||||
}
|
||||
|
||||
} // namespace ref
|
||||
} // namespace ck
|
||||
|
||||
@@ -4,126 +4,493 @@
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/type_convert.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
|
||||
#include "ck/host_utility/hip_check_error.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace ref {
|
||||
|
||||
/*
|
||||
* \brief naive implementation of 3D convolution. Layout is (NDHWC, KZYXC, NDHWK).
|
||||
*
|
||||
* \param N number of batches
|
||||
* \param K number of filters
|
||||
* \param C number of channels of weight
|
||||
* \param (Di, Hi, Wi) depth, height and width dimension of data
|
||||
* \param (Z, Y, X) depth, height and width dimensions of weights
|
||||
* \param (Do, Ho, Wo) depth, height and width dimension of output
|
||||
* \param (stride_z, stride_y, stride_x) strides
|
||||
* \param (dilation_z, dilation_y, dilation_x) dilations
|
||||
* \param (pad_z, pad_y, pad_x) pads
|
||||
*/
|
||||
template <typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename TAcc,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation>
|
||||
__global__ void naive_conv_fwd_ndhwc_kzyxc_ndhwk(const TIn* __restrict__ p_in,
|
||||
const TWei* __restrict__ p_wei,
|
||||
TOut* __restrict__ p_out,
|
||||
const ConvDims dims)
|
||||
// Optimized convolution kernel working with packed (contiguous) tensors
|
||||
// Assumes row-major packing: input[G][N][C][spatial], weight[G][K][C][filter],
|
||||
// output[G][N][K][spatial]
|
||||
template <index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp>
|
||||
__global__ void naive_conv_fwd_packed(const InDataType* __restrict__ p_in,
|
||||
const WeiDataType* __restrict__ p_wei,
|
||||
OutDataType* __restrict__ p_out,
|
||||
index_t G,
|
||||
index_t N,
|
||||
index_t K,
|
||||
index_t C,
|
||||
index_t Di,
|
||||
index_t Hi,
|
||||
index_t Wi,
|
||||
index_t Z,
|
||||
index_t Y,
|
||||
index_t X,
|
||||
index_t Do,
|
||||
index_t Ho,
|
||||
index_t Wo,
|
||||
index_t stride_z,
|
||||
index_t stride_y,
|
||||
index_t stride_x,
|
||||
index_t dilation_z,
|
||||
index_t dilation_y,
|
||||
index_t dilation_x,
|
||||
index_t pad_z,
|
||||
index_t pad_y,
|
||||
index_t pad_x,
|
||||
InElementOp in_op,
|
||||
WeiElementOp wei_op,
|
||||
OutElementOp out_op)
|
||||
{
|
||||
const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const index_t num_threads = blockDim.x * gridDim.x;
|
||||
const long_index_t output_length = dims.N * dims.Do * dims.Ho * dims.Wo * dims.K;
|
||||
const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const long_index_t num_threads = blockDim.x * gridDim.x;
|
||||
|
||||
const index_t out_strides[] = {
|
||||
dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K};
|
||||
const index_t in_strides[] = {
|
||||
dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C};
|
||||
const index_t wei_strides[] = {
|
||||
dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C};
|
||||
InDataType in_val = InDataType{0};
|
||||
WeiDataType wei_val = WeiDataType{0};
|
||||
OutDataType out_val = OutDataType{0};
|
||||
|
||||
constexpr auto in_op = InElementwiseOperation{};
|
||||
constexpr auto wei_op = WeiElementwiseOperation{};
|
||||
constexpr auto out_op = OutElementwiseOperation{};
|
||||
|
||||
TIn in_val = TIn{0};
|
||||
TWei wei_val = TWei{0};
|
||||
TOut out_val = TOut{0};
|
||||
|
||||
for(long_index_t ii = tid; ii < output_length; ii += num_threads)
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
const index_t n = ii / out_strides[0];
|
||||
index_t k = ii - n * out_strides[0];
|
||||
const index_t dO = k / out_strides[1];
|
||||
k -= dO * out_strides[1];
|
||||
const index_t ho = k / out_strides[2];
|
||||
k -= ho * out_strides[2];
|
||||
const index_t wo = k / out_strides[3];
|
||||
k -= wo * out_strides[3];
|
||||
const long_index_t num_out = G * N * K * Wo;
|
||||
const long_index_t in_stride_g = N * C * Wi;
|
||||
const long_index_t in_stride_n = C * Wi;
|
||||
const long_index_t in_stride_c = Wi;
|
||||
const long_index_t wei_stride_g = K * C * X;
|
||||
const long_index_t wei_stride_k = C * X;
|
||||
const long_index_t wei_stride_c = X;
|
||||
const long_index_t out_stride_g = N * K * Wo;
|
||||
const long_index_t out_stride_n = K * Wo;
|
||||
const long_index_t out_stride_k = Wo;
|
||||
|
||||
// Always accumulate in float (FP8/BF8 don't support arithmetic)
|
||||
float acc_float = 0.0f;
|
||||
|
||||
const TIn* in_n = p_in + static_cast<long_index_t>(n) * in_strides[0];
|
||||
const TWei* wei_k = p_wei + static_cast<long_index_t>(k) * wei_strides[0];
|
||||
|
||||
for(index_t z = 0; z < dims.Z; ++z)
|
||||
for(long_index_t idx = tid; idx < num_out; idx += num_threads)
|
||||
{
|
||||
index_t di = dims.stride_z * dO - dims.pad_z + dims.dilation_z * z;
|
||||
const TIn* in_n_di = in_n + di * in_strides[1];
|
||||
const TWei* wei_k_z = wei_k + z * wei_strides[1];
|
||||
index_t remaining = idx;
|
||||
const index_t wo = remaining % Wo;
|
||||
remaining /= Wo;
|
||||
const index_t k = remaining % K;
|
||||
remaining /= K;
|
||||
const index_t n = remaining % N;
|
||||
const index_t g = remaining / N;
|
||||
|
||||
for(index_t y = 0; y < dims.Y; ++y)
|
||||
float acc = 0.0f;
|
||||
const InDataType* in_g = p_in + g * in_stride_g + n * in_stride_n;
|
||||
const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k;
|
||||
|
||||
for(index_t c = 0; c < C; ++c)
|
||||
{
|
||||
index_t hi = dims.stride_y * ho - dims.pad_y + dims.dilation_y * y;
|
||||
const TIn* in_n_di_hi = in_n_di + hi * in_strides[2];
|
||||
const TWei* wei_k_z_y = wei_k_z + y * wei_strides[2];
|
||||
const InDataType* in_gc = in_g + c * in_stride_c;
|
||||
const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c;
|
||||
|
||||
for(index_t x = 0; x < dims.X; ++x)
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
index_t wi = dims.stride_x * wo - dims.pad_x + dims.dilation_x * x;
|
||||
const TIn* in_n_di_hi_wi = in_n_di_hi + wi * in_strides[3];
|
||||
const TWei* wei_k_z_y_x = wei_k_z_y + x * wei_strides[3];
|
||||
|
||||
if(di >= 0 && di < dims.Di && hi >= 0 && hi < dims.Hi && wi >= 0 &&
|
||||
wi < dims.Wi)
|
||||
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
|
||||
if(wi >= 0 && wi < Wi)
|
||||
{
|
||||
for(index_t c = 0; c < dims.C; ++c)
|
||||
in_op(in_val, in_gc[wi]);
|
||||
wei_op(wei_val, wei_gkc[x]);
|
||||
acc += type_convert<float>(in_val) * type_convert<float>(wei_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OutDataType result = type_convert<OutDataType>(acc);
|
||||
out_op(out_val, result);
|
||||
p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + wo] = out_val;
|
||||
}
|
||||
}
|
||||
else if constexpr(NDimSpatial == 2)
|
||||
{
|
||||
const long_index_t num_out = G * N * K * Ho * Wo;
|
||||
const long_index_t in_stride_g = N * C * Hi * Wi;
|
||||
const long_index_t in_stride_n = C * Hi * Wi;
|
||||
const long_index_t in_stride_c = Hi * Wi;
|
||||
const long_index_t in_stride_h = Wi;
|
||||
const long_index_t wei_stride_g = K * C * Y * X;
|
||||
const long_index_t wei_stride_k = C * Y * X;
|
||||
const long_index_t wei_stride_c = Y * X;
|
||||
const long_index_t wei_stride_y = X;
|
||||
const long_index_t out_stride_g = N * K * Ho * Wo;
|
||||
const long_index_t out_stride_n = K * Ho * Wo;
|
||||
const long_index_t out_stride_k = Ho * Wo;
|
||||
const long_index_t out_stride_h = Wo;
|
||||
|
||||
for(long_index_t idx = tid; idx < num_out; idx += num_threads)
|
||||
{
|
||||
index_t remaining = idx;
|
||||
const index_t wo = remaining % Wo;
|
||||
remaining /= Wo;
|
||||
const index_t ho = remaining % Ho;
|
||||
remaining /= Ho;
|
||||
const index_t k = remaining % K;
|
||||
remaining /= K;
|
||||
const index_t n = remaining % N;
|
||||
const index_t g = remaining / N;
|
||||
|
||||
float acc = 0.0f;
|
||||
const InDataType* in_gn = p_in + g * in_stride_g + n * in_stride_n;
|
||||
const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k;
|
||||
|
||||
for(index_t c = 0; c < C; ++c)
|
||||
{
|
||||
const InDataType* in_gnc = in_gn + c * in_stride_c;
|
||||
const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c;
|
||||
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
{
|
||||
long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
|
||||
if(hi >= 0 && hi < Hi)
|
||||
{
|
||||
const InDataType* in_gnch = in_gnc + hi * in_stride_h;
|
||||
const WeiDataType* wei_gkcy = wei_gkc + y * wei_stride_y;
|
||||
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
// Load values from memory
|
||||
TIn in_loaded = in_n_di_hi_wi[c];
|
||||
TWei wei_loaded = wei_k_z_y_x[c];
|
||||
|
||||
// Apply element-wise operations
|
||||
in_op(in_val, in_loaded);
|
||||
wei_op(wei_val, wei_loaded);
|
||||
|
||||
// Always convert to float for multiplication (FP8/BF8 don't support
|
||||
// direct arithmetic)
|
||||
float in_f = type_convert<float>(in_val);
|
||||
float wei_f = type_convert<float>(wei_val);
|
||||
|
||||
// Accumulate in float
|
||||
acc_float += in_f * wei_f;
|
||||
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
|
||||
if(wi >= 0 && wi < Wi)
|
||||
{
|
||||
in_op(in_val, in_gnch[wi]);
|
||||
wei_op(wei_val, wei_gkcy[x]);
|
||||
acc += type_convert<float>(in_val) * type_convert<float>(wei_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OutDataType result = type_convert<OutDataType>(acc);
|
||||
out_op(out_val, result);
|
||||
p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + ho * out_stride_h + wo] =
|
||||
out_val;
|
||||
}
|
||||
}
|
||||
else if constexpr(NDimSpatial == 3)
|
||||
{
|
||||
const long_index_t num_out = G * N * K * Do * Ho * Wo;
|
||||
const long_index_t in_stride_g = N * C * Di * Hi * Wi;
|
||||
const long_index_t in_stride_n = C * Di * Hi * Wi;
|
||||
const long_index_t in_stride_c = Di * Hi * Wi;
|
||||
const long_index_t in_stride_d = Hi * Wi;
|
||||
const long_index_t in_stride_h = Wi;
|
||||
const long_index_t wei_stride_g = K * C * Z * Y * X;
|
||||
const long_index_t wei_stride_k = C * Z * Y * X;
|
||||
const long_index_t wei_stride_c = Z * Y * X;
|
||||
const long_index_t wei_stride_z = Y * X;
|
||||
const long_index_t wei_stride_y = X;
|
||||
const long_index_t out_stride_g = N * K * Do * Ho * Wo;
|
||||
const long_index_t out_stride_n = K * Do * Ho * Wo;
|
||||
const long_index_t out_stride_k = Do * Ho * Wo;
|
||||
const long_index_t out_stride_d = Ho * Wo;
|
||||
const long_index_t out_stride_h = Wo;
|
||||
|
||||
// Convert float accumulator to TAcc, then to output type
|
||||
TAcc acc = type_convert<TAcc>(acc_float);
|
||||
TOut result = type_convert<TOut>(acc);
|
||||
for(long_index_t idx = tid; idx < num_out; idx += num_threads)
|
||||
{
|
||||
index_t remaining = idx;
|
||||
const index_t wo = remaining % Wo;
|
||||
remaining /= Wo;
|
||||
const index_t ho = remaining % Ho;
|
||||
remaining /= Ho;
|
||||
const index_t do_idx = remaining % Do;
|
||||
remaining /= Do;
|
||||
const index_t k = remaining % K;
|
||||
remaining /= K;
|
||||
const index_t n = remaining % N;
|
||||
const index_t g = remaining / N;
|
||||
|
||||
// Apply output element-wise operation (if any)
|
||||
out_op(out_val, result);
|
||||
float acc = 0.0f;
|
||||
const InDataType* in_gn = p_in + g * in_stride_g + n * in_stride_n;
|
||||
const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k;
|
||||
|
||||
// Write transformed result
|
||||
p_out[ii] = out_val;
|
||||
for(index_t c = 0; c < C; ++c)
|
||||
{
|
||||
const InDataType* in_gnc = in_gn + c * in_stride_c;
|
||||
const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c;
|
||||
|
||||
for(index_t z = 0; z < Z; ++z)
|
||||
{
|
||||
long_index_t di = do_idx * stride_z + z * dilation_z - pad_z;
|
||||
if(di >= 0 && di < Di)
|
||||
{
|
||||
const InDataType* in_gncd = in_gnc + di * in_stride_d;
|
||||
const WeiDataType* wei_gkcz = wei_gkc + z * wei_stride_z;
|
||||
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
{
|
||||
long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
|
||||
if(hi >= 0 && hi < Hi)
|
||||
{
|
||||
const InDataType* in_gncdh = in_gncd + hi * in_stride_h;
|
||||
const WeiDataType* wei_gkczy = wei_gkcz + y * wei_stride_y;
|
||||
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
|
||||
if(wi >= 0 && wi < Wi)
|
||||
{
|
||||
in_op(in_val, in_gncdh[wi]);
|
||||
wei_op(wei_val, wei_gkczy[x]);
|
||||
acc += type_convert<float>(in_val) *
|
||||
type_convert<float>(wei_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OutDataType result = type_convert<OutDataType>(acc);
|
||||
out_op(out_val, result);
|
||||
p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + do_idx * out_stride_d +
|
||||
ho * out_stride_h + wo] = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GPU reference convolution - takes ConvParam directly
|
||||
template <typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation>
|
||||
void naive_conv_fwd(const TIn* p_in,
|
||||
const TWei* p_wei,
|
||||
TOut* p_out,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
InElementwiseOperation in_element_op = InElementwiseOperation{},
|
||||
WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{},
|
||||
OutElementwiseOperation out_element_op = OutElementwiseOperation{},
|
||||
hipStream_t stream = nullptr)
|
||||
{
|
||||
const auto ndim = conv_param.num_dim_spatial_;
|
||||
|
||||
const index_t G = conv_param.G_;
|
||||
const index_t N = conv_param.N_;
|
||||
const index_t C = conv_param.C_;
|
||||
const index_t K = conv_param.K_;
|
||||
|
||||
std::vector<index_t> in_lengths = {G, N, C};
|
||||
std::vector<index_t> wei_lengths = {G, K, C};
|
||||
std::vector<index_t> out_lengths = {G, N, K};
|
||||
|
||||
for(index_t i = 0; i < ndim; ++i)
|
||||
{
|
||||
in_lengths.push_back(static_cast<index_t>(conv_param.input_spatial_lengths_[i]));
|
||||
wei_lengths.push_back(static_cast<index_t>(conv_param.filter_spatial_lengths_[i]));
|
||||
out_lengths.push_back(static_cast<index_t>(conv_param.output_spatial_lengths_[i]));
|
||||
}
|
||||
|
||||
// Calculate total elements for buffer allocation
|
||||
long_index_t in_total = 1, wei_total = 1, out_total = 1;
|
||||
for(auto l : in_lengths)
|
||||
in_total *= l;
|
||||
for(auto l : wei_lengths)
|
||||
wei_total *= l;
|
||||
for(auto l : out_lengths)
|
||||
out_total *= l;
|
||||
|
||||
// Allocate packed buffers
|
||||
SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn));
|
||||
SimpleDeviceMem wei_packed_buf(wei_total * sizeof(TWei));
|
||||
SimpleDeviceMem out_packed_buf(out_total * sizeof(TOut));
|
||||
|
||||
TIn* p_in_packed = static_cast<TIn*>(in_packed_buf.GetDeviceBuffer());
|
||||
TWei* p_wei_packed = static_cast<TWei*>(wei_packed_buf.GetDeviceBuffer());
|
||||
TOut* p_out_packed = static_cast<TOut*>(out_packed_buf.GetDeviceBuffer());
|
||||
|
||||
// Compute strides and allocate device arrays for pack/unpack
|
||||
std::vector<index_t> in_strides = compute_conv_tensor_strides<InLayout>(in_lengths, ndim);
|
||||
std::vector<index_t> wei_strides = compute_conv_tensor_strides<WeiLayout>(wei_lengths, ndim);
|
||||
std::vector<index_t> out_strides = compute_conv_tensor_strides<OutLayout>(out_lengths, ndim);
|
||||
|
||||
const size_t dim_count = in_lengths.size();
|
||||
SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t));
|
||||
SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t));
|
||||
|
||||
index_t* d_in_lengths = static_cast<index_t*>(in_lengths_buf.GetDeviceBuffer());
|
||||
index_t* d_in_strides = static_cast<index_t*>(in_strides_buf.GetDeviceBuffer());
|
||||
index_t* d_wei_lengths = static_cast<index_t*>(wei_lengths_buf.GetDeviceBuffer());
|
||||
index_t* d_wei_strides = static_cast<index_t*>(wei_strides_buf.GetDeviceBuffer());
|
||||
index_t* d_out_lengths = static_cast<index_t*>(out_lengths_buf.GetDeviceBuffer());
|
||||
index_t* d_out_strides = static_cast<index_t*>(out_strides_buf.GetDeviceBuffer());
|
||||
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK_ERROR(hipMemcpy(
|
||||
d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
|
||||
|
||||
// Pack input and weight tensors to contiguous layout
|
||||
constexpr int block_size = 256;
|
||||
strided_copy_kernel<TIn, false>
|
||||
<<<(in_total + block_size - 1) / block_size, block_size, 0, stream>>>(
|
||||
p_in, p_in_packed, d_in_lengths, d_in_strides, dim_count, in_total);
|
||||
strided_copy_kernel<TWei, false>
|
||||
<<<(wei_total + block_size - 1) / block_size, block_size, 0, stream>>>(
|
||||
p_wei, p_wei_packed, d_wei_lengths, d_wei_strides, dim_count, wei_total);
|
||||
|
||||
// Build conv parameter vectors for kernel invocation
|
||||
std::vector<index_t> conv_strides(ndim);
|
||||
std::vector<index_t> conv_dilations(ndim);
|
||||
std::vector<index_t> input_pads(ndim);
|
||||
for(index_t i = 0; i < ndim; ++i)
|
||||
{
|
||||
conv_strides[i] = static_cast<index_t>(conv_param.conv_filter_strides_[i]);
|
||||
conv_dilations[i] = static_cast<index_t>(conv_param.conv_filter_dilations_[i]);
|
||||
input_pads[i] = static_cast<index_t>(conv_param.input_left_pads_[i]);
|
||||
}
|
||||
|
||||
// Run convolution kernel on packed data
|
||||
const int out_grid = (out_total + block_size - 1) / block_size;
|
||||
|
||||
if(ndim == 1)
|
||||
{
|
||||
naive_conv_fwd_packed<1,
|
||||
TIn,
|
||||
TWei,
|
||||
TOut,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
<<<out_grid, block_size, 0, stream>>>(p_in_packed,
|
||||
p_wei_packed,
|
||||
p_out_packed,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
1,
|
||||
1,
|
||||
in_lengths[3],
|
||||
1,
|
||||
1,
|
||||
wei_lengths[3],
|
||||
1,
|
||||
1,
|
||||
out_lengths[3],
|
||||
1,
|
||||
1,
|
||||
conv_strides[0],
|
||||
1,
|
||||
1,
|
||||
conv_dilations[0],
|
||||
0,
|
||||
0,
|
||||
input_pads[0],
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(ndim == 2)
|
||||
{
|
||||
naive_conv_fwd_packed<2,
|
||||
TIn,
|
||||
TWei,
|
||||
TOut,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
<<<out_grid, block_size, 0, stream>>>(p_in_packed,
|
||||
p_wei_packed,
|
||||
p_out_packed,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
1,
|
||||
in_lengths[3],
|
||||
in_lengths[4],
|
||||
1,
|
||||
wei_lengths[3],
|
||||
wei_lengths[4],
|
||||
1,
|
||||
out_lengths[3],
|
||||
out_lengths[4],
|
||||
1,
|
||||
conv_strides[0],
|
||||
conv_strides[1],
|
||||
1,
|
||||
conv_dilations[0],
|
||||
conv_dilations[1],
|
||||
0,
|
||||
input_pads[0],
|
||||
input_pads[1],
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else // 3D
|
||||
{
|
||||
naive_conv_fwd_packed<3,
|
||||
TIn,
|
||||
TWei,
|
||||
TOut,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
<<<out_grid, block_size, 0, stream>>>(p_in_packed,
|
||||
p_wei_packed,
|
||||
p_out_packed,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
in_lengths[3],
|
||||
in_lengths[4],
|
||||
in_lengths[5],
|
||||
wei_lengths[3],
|
||||
wei_lengths[4],
|
||||
wei_lengths[5],
|
||||
out_lengths[3],
|
||||
out_lengths[4],
|
||||
out_lengths[5],
|
||||
conv_strides[0],
|
||||
conv_strides[1],
|
||||
conv_strides[2],
|
||||
conv_dilations[0],
|
||||
conv_dilations[1],
|
||||
conv_dilations[2],
|
||||
input_pads[0],
|
||||
input_pads[1],
|
||||
input_pads[2],
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
|
||||
// Unpack
|
||||
strided_copy_kernel<TOut, true><<<out_grid, block_size, 0, stream>>>(
|
||||
p_out_packed, p_out, d_out_lengths, d_out_strides, dim_count, out_total);
|
||||
|
||||
HIP_CHECK_ERROR(hipGetLastError());
|
||||
|
||||
// Memory automatically freed by SimpleDeviceMem destructors
|
||||
}
|
||||
|
||||
} // namespace ref
|
||||
} // namespace ck
|
||||
|
||||
@@ -0,0 +1,177 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/host_utility/hip_check_error.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <vector>
|
||||
|
||||
namespace ck {
|
||||
namespace ref {
|
||||
|
||||
// RAII wrapper for device memory to prevent leaks
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&p_mem_), mem_size));
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
// Helper function to map layout dimension character to index in lengths array
|
||||
// lengths array structure: [G, N/K, C/K, spatial...]
|
||||
// Maps a layout dimension character to its index in the canonical lengths
// array [G, N/K, C/K, spatial...]. Returns -1 when the character does not
// belong to a tensor of the given spatial rank.
inline int map_dim_char_to_index(char dim_char, index_t ndim_spatial, bool is_weight)
{
    // Non-spatial dimensions.
    if(dim_char == 'G')
        return 0;
    // Index 1: batch (input/output) or output channels (weight).
    if(dim_char == 'N' || (dim_char == 'K' && is_weight))
        return 1;
    // Index 2: input channels (input/weight) or output channels (output).
    if(dim_char == 'C' || (dim_char == 'K' && !is_weight))
        return 2;

    // Spatial dimensions. Input/output layouts use D/H/W, weights use Z/Y/X.
    // The innermost spatial dim (W/X) always ends up at index 2 + ndim_spatial.
    const bool is_w = (dim_char == 'W' || dim_char == 'X');
    const bool is_h = (dim_char == 'H' || dim_char == 'Y');
    const bool is_d = (dim_char == 'D' || dim_char == 'Z');

    if(is_w && ndim_spatial >= 1 && ndim_spatial <= 3)
        return 2 + ndim_spatial; // 3 for 1D, 4 for 2D, 5 for 3D
    if(is_h && (ndim_spatial == 2 || ndim_spatial == 3))
        return 1 + ndim_spatial; // 3 for 2D, 4 for 3D
    if(is_d && ndim_spatial == 3)
        return 3;

    // Unknown dimension character for this spatial rank.
    return -1;
}
|
||||
|
||||
// Template function to compute layout-aware strides based on layout name
|
||||
// The layout name directly encodes memory ordering from left to right
|
||||
template <typename Layout>
|
||||
inline std::vector<index_t> compute_conv_tensor_strides(const std::vector<index_t>& lengths,
|
||||
index_t ndim_spatial)
|
||||
{
|
||||
constexpr const char* layout_name = Layout::name;
|
||||
const int num_dims = static_cast<int>(lengths.size());
|
||||
std::vector<index_t> strides(num_dims, 0);
|
||||
|
||||
// Determine if this is a weight tensor (has 'K' but not 'N')
|
||||
bool has_k = false;
|
||||
bool has_n = false;
|
||||
for(const char* p = layout_name; *p != '\0'; ++p)
|
||||
{
|
||||
if(*p == 'K')
|
||||
has_k = true;
|
||||
if(*p == 'N')
|
||||
has_n = true;
|
||||
}
|
||||
bool is_weight = has_k && !has_n;
|
||||
|
||||
// Build dimension ordering from layout name (parse string)
|
||||
std::vector<char> dim_order;
|
||||
const char dim_chars[] = {'G', 'N', 'K', 'C', 'D', 'H', 'W', 'X', 'Y', 'Z'};
|
||||
for(const char* p = layout_name; *p != '\0'; ++p)
|
||||
{
|
||||
char c = *p;
|
||||
// Skip underscores (strided layouts)
|
||||
if(c == '_')
|
||||
continue;
|
||||
// Valid dimension characters
|
||||
if(std::find(std::begin(dim_chars), std::end(dim_chars), c) != std::end(dim_chars))
|
||||
{
|
||||
dim_order.push_back(c);
|
||||
}
|
||||
}
|
||||
|
||||
// Compute strides: process from right to left (innermost to outermost)
|
||||
index_t stride = 1;
|
||||
for(int i = static_cast<int>(dim_order.size()) - 1; i >= 0; --i)
|
||||
{
|
||||
char dim_char = dim_order[i];
|
||||
int length_idx = map_dim_char_to_index(dim_char, ndim_spatial, is_weight);
|
||||
|
||||
if(length_idx >= 0 && length_idx < num_dims)
|
||||
{
|
||||
strides[length_idx] = stride;
|
||||
stride *= lengths[length_idx];
|
||||
}
|
||||
}
|
||||
|
||||
return strides;
|
||||
}
|
||||
|
||||
// Unified kernel for strided tensor copy operations
|
||||
// IsUnpack=false: Pack strided -> contiguous
|
||||
// IsUnpack=true: Unpack contiguous -> strided
|
||||
template <typename DataType, bool IsUnpack>
|
||||
__global__ void strided_copy_kernel(const DataType* __restrict__ src,
|
||||
DataType* __restrict__ dst,
|
||||
const index_t* tensor_lengths,
|
||||
const index_t* strided_strides,
|
||||
int num_dims,
|
||||
long_index_t total_elements)
|
||||
{
|
||||
const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const long_index_t num_threads = blockDim.x * gridDim.x;
|
||||
|
||||
for(long_index_t linear_idx = tid; linear_idx < total_elements; linear_idx += num_threads)
|
||||
{
|
||||
// Compute strided index from linear index
|
||||
long_index_t remaining = linear_idx;
|
||||
long_index_t strided_idx = 0;
|
||||
|
||||
for(int dim = num_dims - 1; dim >= 0; --dim)
|
||||
{
|
||||
index_t coord = remaining % tensor_lengths[dim];
|
||||
remaining /= tensor_lengths[dim];
|
||||
strided_idx += coord * strided_strides[dim];
|
||||
}
|
||||
|
||||
// Direction determines which is src and which is dst
|
||||
if constexpr(IsUnpack)
|
||||
{
|
||||
// Unpack: src is contiguous (linear_idx), dst is strided (strided_idx)
|
||||
dst[strided_idx] = src[linear_idx];
|
||||
}
|
||||
else
|
||||
{
|
||||
// Pack: src is strided (strided_idx), dst is contiguous (linear_idx)
|
||||
dst[linear_idx] = src[strided_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ref
|
||||
} // namespace ck
|
||||
@@ -18,6 +18,7 @@
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -89,8 +90,39 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
|
||||
float max_accumulated_value = 0;
|
||||
if(do_verification)
|
||||
if(do_verification == 2)
|
||||
{
|
||||
// Use GPU reference for verification
|
||||
std::cout << "Using GPU reference for verification" << std::endl;
|
||||
|
||||
// Allocate GPU reference output buffer
|
||||
DeviceMem gpu_ref_in_buf(sizeof(InDataType) * in_host.mDesc.GetElementSpaceSize());
|
||||
|
||||
// Call GPU reference with ConvParam directly
|
||||
ref::naive_conv_bwd_data<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
reinterpret_cast<InDataType*>(gpu_ref_in_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
|
||||
conv_param,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
// Copy GPU reference result to host for comparison
|
||||
gpu_ref_in_buf.FromDevice(in_host.mData.data());
|
||||
max_accumulated_value = *std::max_element(in_host.mData.begin(), in_host.mData.end());
|
||||
}
|
||||
else if(do_verification == 1)
|
||||
{
|
||||
// Use CPU reference for verification (default)
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace profiler {
|
||||
@@ -93,29 +94,69 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
float max_accumulated_value = 0;
|
||||
if(do_verification)
|
||||
{
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>{};
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(input,
|
||||
weight_host_result,
|
||||
output,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op,
|
||||
{},
|
||||
{},
|
||||
{});
|
||||
if(do_verification == 1)
|
||||
{
|
||||
// CPU reference
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>{};
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(input,
|
||||
weight_host_result,
|
||||
output,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op,
|
||||
{},
|
||||
{},
|
||||
{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
}
|
||||
else if(do_verification == 2)
|
||||
{
|
||||
// GPU reference
|
||||
std::cout << "Running GPU reference implementation..." << std::endl;
|
||||
|
||||
// Allocate device memory for reference
|
||||
DeviceMem in_ref_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_ref_buf(sizeof(WeiDataType) *
|
||||
weight_host_result.mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_ref_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_ref_buf.ToDevice(input.mData.data());
|
||||
out_ref_buf.ToDevice(output.mData.data());
|
||||
|
||||
// Call GPU reference with ConvParam directly
|
||||
ck::ref::naive_conv_bwd_weight<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
static_cast<const InDataType*>(in_ref_buf.GetDeviceBuffer()),
|
||||
static_cast<WeiDataType*>(wei_ref_buf.GetDeviceBuffer()),
|
||||
static_cast<const OutDataType*>(out_ref_buf.GetDeviceBuffer()),
|
||||
conv_param,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
// Copy result back to host
|
||||
wei_ref_buf.FromDevice(weight_host_result.mData.data());
|
||||
}
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
max_accumulated_value =
|
||||
*std::max_element(weight_host_result.mData.begin(), weight_host_result.mData.end());
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace profiler {
|
||||
@@ -113,8 +114,38 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
wei_device_buf.ToDevice(weight.mData.data());
|
||||
|
||||
// run reference op
|
||||
if(do_verification)
|
||||
if(do_verification == 2)
|
||||
{
|
||||
// Use GPU reference for verification
|
||||
std::cout << "Using GPU reference for verification" << std::endl;
|
||||
|
||||
// Allocate GPU reference output buffer
|
||||
DeviceMem gpu_ref_out_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
|
||||
|
||||
// Call GPU reference with ConvParam directly
|
||||
ref::naive_conv_fwd<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<OutDataType*>(gpu_ref_out_buf.GetDeviceBuffer()),
|
||||
conv_param,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
// Copy GPU reference result to host for comparison
|
||||
gpu_ref_out_buf.FromDevice(host_output.mData.data());
|
||||
}
|
||||
else if(do_verification == 1)
|
||||
{
|
||||
// Use CPU reference for verification (default)
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
|
||||
@@ -311,4 +311,5 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx12")
|
||||
endif()
|
||||
add_subdirectory(position_embedding)
|
||||
add_subdirectory(scatter_gather)
|
||||
add_subdirectory(gpu_reference)
|
||||
add_subdirectory(util)
|
||||
|
||||
@@ -5,3 +5,8 @@ add_gtest_executable(test_convnd_fwd convnd_fwd_xdl.cpp)
|
||||
if(result EQUAL 0)
|
||||
target_link_libraries(test_convnd_fwd PRIVATE utility device_conv2d_fwd_instance)
|
||||
endif()
|
||||
|
||||
add_gtest_executable(test_convnd_fwd_naive convnd_fwd_naive.cpp)
|
||||
if(result EQUAL 0)
|
||||
target_link_libraries(test_convnd_fwd_naive PRIVATE utility)
|
||||
endif()
|
||||
|
||||
220
test/convnd_fwd/convnd_fwd_naive.cpp
Normal file
220
test/convnd_fwd/convnd_fwd_naive.cpp
Normal file
@@ -0,0 +1,220 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
|
||||
using InDataType = float;
|
||||
using WeiDataType = float;
|
||||
using OutDataType = float;
|
||||
using AccDataType = float;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using DeviceConvNaive = ck::tensor_operation::device::
|
||||
DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K<InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>;
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
bool run_conv3d_naive_test(const ck::utils::conv::ConvParam& conv_param)
|
||||
{
|
||||
using namespace ck;
|
||||
using namespace ck::tensor_operation::host;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNCDHW;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKCZYX;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNKDHW;
|
||||
|
||||
const auto in_g_n_c_wis_desc =
|
||||
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
|
||||
const auto wei_g_k_c_xs_desc =
|
||||
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
|
||||
const auto out_g_n_k_wos_desc =
|
||||
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
|
||||
|
||||
Tensor<InDataType> in(in_g_n_c_wis_desc);
|
||||
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
|
||||
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
|
||||
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
|
||||
|
||||
// Initialize tensors
|
||||
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
|
||||
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
|
||||
|
||||
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_device_buf.ToDevice(in.mData.data());
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
|
||||
// Run device kernel - convert long_index_t vectors to index_t
|
||||
std::vector<ck::index_t> input_spatial_lengths(conv_param.input_spatial_lengths_.begin(),
|
||||
conv_param.input_spatial_lengths_.end());
|
||||
std::vector<ck::index_t> filter_spatial_lengths(conv_param.filter_spatial_lengths_.begin(),
|
||||
conv_param.filter_spatial_lengths_.end());
|
||||
auto output_spatial_lengths_long = conv_param.GetOutputSpatialLengths();
|
||||
std::vector<ck::index_t> output_spatial_lengths(output_spatial_lengths_long.begin(),
|
||||
output_spatial_lengths_long.end());
|
||||
std::vector<ck::index_t> conv_filter_strides(conv_param.conv_filter_strides_.begin(),
|
||||
conv_param.conv_filter_strides_.end());
|
||||
std::vector<ck::index_t> conv_filter_dilations(conv_param.conv_filter_dilations_.begin(),
|
||||
conv_param.conv_filter_dilations_.end());
|
||||
std::vector<ck::index_t> input_left_pads(conv_param.input_left_pads_.begin(),
|
||||
conv_param.input_left_pads_.end());
|
||||
std::vector<ck::index_t> input_right_pads(conv_param.input_right_pads_.begin(),
|
||||
conv_param.input_right_pads_.end());
|
||||
|
||||
auto conv = DeviceConvNaive{};
|
||||
auto invoker = conv.MakeInvoker();
|
||||
auto argument =
|
||||
conv.MakeArgument(static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
|
||||
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
InElementOp{},
|
||||
WeiElementOp{},
|
||||
OutElementOp{});
|
||||
|
||||
if(!conv.IsSupportedArgument(argument))
|
||||
{
|
||||
std::cout << "Unsupported argument for naive conv3d kernel" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
invoker.Run(argument, StreamConfig{nullptr, false});
|
||||
|
||||
// Run CPU reference
|
||||
auto ref_conv = ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
AccDataType>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in,
|
||||
wei,
|
||||
out_host,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
InElementOp{},
|
||||
WeiElementOp{},
|
||||
OutElementOp{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
// Compare results
|
||||
out_device_buf.FromDevice(out_device.mData.data());
|
||||
|
||||
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-3, 1e-3);
|
||||
}
|
||||
|
||||
// Small 3D problem: G=1, N=2, K=C=16, 7^3 input, 3^3 filter, stride 2, pad 1.
TEST(TestConv3dNaive, Conv3dNaive_Small)
{
    ck::utils::conv::ConvParam param{3,         // spatial_dim
                                     1,         // G
                                     2,         // N
                                     16,        // K
                                     16,        // C
                                     {3, 3, 3}, // filter
                                     {7, 7, 7}, // input spatial
                                     {2, 2, 2}, // strides
                                     {1, 1, 1}, // dilations
                                     {1, 1, 1}, // left pads
                                     {1, 1, 1}};// right pads

    EXPECT_TRUE(run_conv3d_naive_test<3>(param));
}
|
||||
|
||||
// Medium 3D problem: G=1, N=4, K=C=32, 14^3 input, 3^3 filter, unit stride, pad 1.
TEST(TestConv3dNaive, Conv3dNaive_Medium)
{
    ck::utils::conv::ConvParam param{3,            // spatial_dim
                                     1,            // G
                                     4,            // N
                                     32,           // K
                                     32,           // C
                                     {3, 3, 3},    // filter
                                     {14, 14, 14}, // input spatial
                                     {1, 1, 1},    // strides
                                     {1, 1, 1},    // dilations
                                     {1, 1, 1},    // left pads
                                     {1, 1, 1}};   // right pads

    EXPECT_TRUE(run_conv3d_naive_test<3>(param));
}
|
||||
|
||||
// Pointwise (1x1x1 filter) 3D problem with no padding: G=1, N=2, K=C=24, 8^3 input.
TEST(TestConv3dNaive, Conv3dNaive_UnitFilter)
{
    ck::utils::conv::ConvParam param{3,          // spatial_dim
                                     1,          // G
                                     2,          // N
                                     24,         // K
                                     24,         // C
                                     {1, 1, 1},  // filter
                                     {8, 8, 8},  // input spatial
                                     {1, 1, 1},  // strides
                                     {1, 1, 1},  // dilations
                                     {0, 0, 0},  // left pads
                                     {0, 0, 0}}; // right pads

    EXPECT_TRUE(run_conv3d_naive_test<3>(param));
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
11
test/gpu_reference/CMakeLists.txt
Normal file
11
test/gpu_reference/CMakeLists.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
add_gtest_executable(test_gpu_reference_conv_fwd test_gpu_reference_conv_fwd.cpp)
|
||||
target_link_libraries(test_gpu_reference_conv_fwd PRIVATE utility)
|
||||
|
||||
add_gtest_executable(test_gpu_reference_conv_bwd_data test_gpu_reference_conv_bwd_data.cpp)
|
||||
target_link_libraries(test_gpu_reference_conv_bwd_data PRIVATE utility)
|
||||
|
||||
add_gtest_executable(test_gpu_reference_conv_bwd_weight test_gpu_reference_conv_bwd_weight.cpp)
|
||||
target_link_libraries(test_gpu_reference_conv_bwd_weight PRIVATE utility)
|
||||
137
test/gpu_reference/common_test_params.hpp
Normal file
137
test/gpu_reference/common_test_params.hpp
Normal file
@@ -0,0 +1,137 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include <vector>
|
||||
|
||||
namespace ck {
|
||||
namespace test {
|
||||
|
||||
// Common test shapes for all convolution tests (fwd, bwd_data, bwd_weight)
|
||||
namespace conv_test_shapes {
|
||||
|
||||
// 2D Conv, FP16, Small
|
||||
// Small 2D problem: G=1, N=2, K=C=8, 7x7 input, 3x3 filter,
// unit stride/dilation, no padding.
inline ck::utils::conv::ConvParam get_2d_small()
{
    return ck::utils::conv::ConvParam(
        2, 1, 2, 8, 8, {3, 3}, {7, 7}, {1, 1}, {1, 1}, {0, 0}, {0, 0});
}
|
||||
|
||||
// 2D Conv, FP32, Medium
|
||||
// Medium 2D problem: G=1, N=4, K=C=16, 14x14 input, 3x3 filter,
// unit stride/dilation, no padding.
inline ck::utils::conv::ConvParam get_2d_medium()
{
    return ck::utils::conv::ConvParam(
        2, 1, 4, 16, 16, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {0, 0}, {0, 0});
}
|
||||
|
||||
// 1D Conv, FP16
|
||||
// 1D problem: G=1, N=2, K=C=8, length-16 input, length-3 filter,
// unit stride/dilation, no padding.
inline ck::utils::conv::ConvParam get_1d()
{
    return ck::utils::conv::ConvParam(1, 1, 2, 8, 8, {3}, {16}, {1}, {1}, {0}, {0});
}
|
||||
|
||||
// 3D Conv, FP16, Small
|
||||
// Small 3D problem: G=1, N=1, K=C=8, 5^3 input, 3^3 filter,
// unit stride/dilation, no padding.
inline ck::utils::conv::ConvParam get_3d_small()
{
    return ck::utils::conv::ConvParam(
        3, 1, 1, 8, 8, {3, 3, 3}, {5, 5, 5}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0});
}
|
||||
|
||||
// 2D Conv with stride
|
||||
// Strided 2D problem: G=1, N=2, K=C=8, 8x8 input, 3x3 filter,
// stride 2, unit dilation, no padding.
inline ck::utils::conv::ConvParam get_2d_stride2()
{
    return ck::utils::conv::ConvParam(
        2, 1, 2, 8, 8, {3, 3}, {8, 8}, {2, 2}, {1, 1}, {0, 0}, {0, 0});
}
|
||||
|
||||
// 2D Grouped Conv, FP16, G=2
|
||||
// Grouped 2D problem, G=2: N=2, K=8 total output channels,
// C=16 total input channels (8 per group), 7x7 input, 3x3 filter,
// unit stride/dilation, no padding.
inline ck::utils::conv::ConvParam get_2d_grouped_g2()
{
    return ck::utils::conv::ConvParam(
        2, 2, 2, 8, 16, {3, 3}, {7, 7}, {1, 1}, {1, 1}, {0, 0}, {0, 0});
}
|
||||
|
||||
// 2D Grouped Conv, FP32, G=4
|
||||
// Grouped 2D problem, G=4: N=1, K=16 total output channels,
// C=16 total input channels (4 per group), 8x8 input, 3x3 filter,
// unit stride/dilation, no padding.
inline ck::utils::conv::ConvParam get_2d_grouped_g4()
{
    return ck::utils::conv::ConvParam(
        2, 4, 1, 16, 16, {3, 3}, {8, 8}, {1, 1}, {1, 1}, {0, 0}, {0, 0});
}
|
||||
|
||||
} // namespace conv_test_shapes
|
||||
} // namespace test
|
||||
} // namespace ck
|
||||
385
test/gpu_reference/gpu_reference_utils.hpp
Normal file
385
test/gpu_reference/gpu_reference_utils.hpp
Normal file
@@ -0,0 +1,385 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/host_utility/hip_check_error.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
// CPU references
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
|
||||
|
||||
// GPU references
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
#include "common_test_params.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace test {
|
||||
|
||||
// Which convolution direction a test exercises.
enum class ConvKernelType
{
    Forward,        // forward convolution: writes the output tensor
    BackwardData,   // data gradient: writes the input tensor
    BackwardWeight  // weight gradient: writes the weight tensor
};
|
||||
|
||||
// Helper function to initialize and copy a tensor to device
|
||||
template <typename DataType>
|
||||
void initialize_and_copy_tensor(Tensor<DataType>& host_tensor, DeviceMem& device_mem)
|
||||
{
|
||||
host_tensor.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5});
|
||||
device_mem.ToDevice(host_tensor.mData.data());
|
||||
}
|
||||
|
||||
// Helper to get default layout types based on NDimSpatial
|
||||
template <index_t NDimSpatial>
|
||||
struct DefaultConvLayouts
|
||||
{
|
||||
using InLayout = std::conditional_t<NDimSpatial == 3,
|
||||
tensor_layout::convolution::GNCDHW,
|
||||
std::conditional_t<NDimSpatial == 2,
|
||||
tensor_layout::convolution::GNCHW,
|
||||
tensor_layout::convolution::GNCW>>;
|
||||
using WeiLayout = std::conditional_t<NDimSpatial == 3,
|
||||
tensor_layout::convolution::GKCZYX,
|
||||
std::conditional_t<NDimSpatial == 2,
|
||||
tensor_layout::convolution::GKCYX,
|
||||
tensor_layout::convolution::GKCX>>;
|
||||
using OutLayout = std::conditional_t<NDimSpatial == 3,
|
||||
tensor_layout::convolution::GNKDHW,
|
||||
std::conditional_t<NDimSpatial == 2,
|
||||
tensor_layout::convolution::GNKHW,
|
||||
tensor_layout::convolution::GNKW>>;
|
||||
};
|
||||
|
||||
// Forward convolution implementation
|
||||
template <index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout>
|
||||
bool test_conv_fwd_impl(const ck::utils::conv::ConvParam& params,
|
||||
const Tensor<InDataType>& input_cpu,
|
||||
const Tensor<WeiDataType>& weight_cpu,
|
||||
DeviceMem& input_dev,
|
||||
DeviceMem& weight_dev,
|
||||
DeviceMem& output_dev)
|
||||
{
|
||||
using InElementOp = tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
// Call GPU reference with ConvParam directly
|
||||
ref::naive_conv_fwd<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<OutDataType*>(output_dev.GetDeviceBuffer()),
|
||||
params);
|
||||
|
||||
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
||||
|
||||
// Run CPU reference
|
||||
std::vector<long_index_t> strides_long(params.conv_filter_strides_.begin(),
|
||||
params.conv_filter_strides_.end());
|
||||
std::vector<long_index_t> dilations_long(params.conv_filter_dilations_.begin(),
|
||||
params.conv_filter_dilations_.end());
|
||||
std::vector<long_index_t> pads_long(params.input_left_pads_.begin(),
|
||||
params.input_left_pads_.end());
|
||||
|
||||
Tensor<InDataType> input_ref = input_cpu;
|
||||
Tensor<WeiDataType> weight_ref = weight_cpu;
|
||||
Tensor<OutDataType> output_ref(
|
||||
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(params));
|
||||
|
||||
auto ref_conv = tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>();
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_arg = ref_conv.MakeArgument(input_ref,
|
||||
weight_ref,
|
||||
output_ref,
|
||||
strides_long,
|
||||
dilations_long,
|
||||
pads_long,
|
||||
pads_long,
|
||||
InElementOp{},
|
||||
WeiElementOp{},
|
||||
OutElementOp{});
|
||||
ref_invoker.Run(ref_arg);
|
||||
|
||||
// Copy result from device and compare
|
||||
Tensor<OutDataType> output_gpu(output_ref.mDesc);
|
||||
output_dev.FromDevice(output_gpu.mData.data());
|
||||
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
||||
|
||||
// Compare results
|
||||
return ck::utils::check_err(output_gpu, output_ref);
|
||||
}
|
||||
|
||||
// Backward data convolution implementation
|
||||
template <index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout>
|
||||
bool test_conv_bwd_data_impl(const ck::utils::conv::ConvParam& params,
|
||||
const Tensor<WeiDataType>& weight_cpu,
|
||||
const Tensor<OutDataType>& output_cpu,
|
||||
DeviceMem& weight_dev,
|
||||
DeviceMem& output_dev,
|
||||
DeviceMem& input_dev)
|
||||
{
|
||||
using InElementOp = tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
// Call GPU reference with ConvParam directly
|
||||
ref::naive_conv_bwd_data<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
reinterpret_cast<InDataType*>(input_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const OutDataType*>(output_dev.GetDeviceBuffer()),
|
||||
params);
|
||||
|
||||
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
||||
|
||||
// Run CPU reference
|
||||
std::vector<long_index_t> strides_long(params.conv_filter_strides_.begin(),
|
||||
params.conv_filter_strides_.end());
|
||||
std::vector<long_index_t> dilations_long(params.conv_filter_dilations_.begin(),
|
||||
params.conv_filter_dilations_.end());
|
||||
std::vector<long_index_t> pads_long(params.input_left_pads_.begin(),
|
||||
params.input_left_pads_.end());
|
||||
|
||||
Tensor<InDataType> input_ref(
|
||||
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(params));
|
||||
Tensor<WeiDataType> weight_ref = weight_cpu;
|
||||
Tensor<OutDataType> output_ref = output_cpu;
|
||||
|
||||
auto ref_conv = tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>();
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_arg = ref_conv.MakeArgument(input_ref,
|
||||
weight_ref,
|
||||
output_ref,
|
||||
strides_long,
|
||||
dilations_long,
|
||||
pads_long,
|
||||
pads_long,
|
||||
InElementOp{},
|
||||
WeiElementOp{},
|
||||
OutElementOp{});
|
||||
ref_invoker.Run(ref_arg);
|
||||
|
||||
// Copy result from device and compare
|
||||
Tensor<InDataType> input_gpu(input_ref.mDesc);
|
||||
input_dev.FromDevice(input_gpu.mData.data());
|
||||
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
||||
|
||||
// Compare results
|
||||
return ck::utils::check_err(input_gpu, input_ref);
|
||||
}
|
||||
|
||||
// Backward weight convolution implementation
|
||||
template <index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout>
|
||||
bool test_conv_bwd_weight_impl(const ck::utils::conv::ConvParam& params,
|
||||
const Tensor<InDataType>& input_cpu,
|
||||
const Tensor<OutDataType>& output_cpu,
|
||||
DeviceMem& input_dev,
|
||||
DeviceMem& output_dev,
|
||||
DeviceMem& weight_dev)
|
||||
{
|
||||
using InElementOp = tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
// Call GPU reference with ConvParam directly
|
||||
ref::naive_conv_bwd_weight<InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>(
|
||||
reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<WeiDataType*>(weight_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const OutDataType*>(output_dev.GetDeviceBuffer()),
|
||||
params);
|
||||
|
||||
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
||||
|
||||
// Run CPU reference
|
||||
std::vector<long_index_t> strides_long(params.conv_filter_strides_.begin(),
|
||||
params.conv_filter_strides_.end());
|
||||
std::vector<long_index_t> dilations_long(params.conv_filter_dilations_.begin(),
|
||||
params.conv_filter_dilations_.end());
|
||||
std::vector<long_index_t> pads_long(params.input_left_pads_.begin(),
|
||||
params.input_left_pads_.end());
|
||||
|
||||
Tensor<InDataType> input_ref = input_cpu;
|
||||
Tensor<WeiDataType> weight_ref(
|
||||
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(params));
|
||||
Tensor<OutDataType> output_ref = output_cpu;
|
||||
|
||||
auto ref_conv = tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>();
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_arg = ref_conv.MakeArgument(input_ref,
|
||||
weight_ref,
|
||||
output_ref,
|
||||
strides_long,
|
||||
dilations_long,
|
||||
pads_long,
|
||||
pads_long,
|
||||
InElementOp{},
|
||||
WeiElementOp{},
|
||||
OutElementOp{});
|
||||
ref_invoker.Run(ref_arg);
|
||||
|
||||
// Copy result from device and compare
|
||||
Tensor<WeiDataType> weight_gpu(weight_ref.mDesc);
|
||||
weight_dev.FromDevice(weight_gpu.mData.data());
|
||||
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
||||
|
||||
// Compare results
|
||||
return ck::utils::check_err(weight_gpu, weight_ref);
|
||||
}
|
||||
|
||||
// Main test function - dispatches to specific implementations
|
||||
template <index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InLayout = typename DefaultConvLayouts<NDimSpatial>::InLayout,
|
||||
typename WeiLayout = typename DefaultConvLayouts<NDimSpatial>::WeiLayout,
|
||||
typename OutLayout = typename DefaultConvLayouts<NDimSpatial>::OutLayout>
|
||||
bool test_conv_gpu_ref(const ck::utils::conv::ConvParam& params, ConvKernelType kernel_type)
|
||||
{
|
||||
// Create tensor descriptors using the specified layouts
|
||||
const auto in_g_n_c_wis_desc =
|
||||
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(params);
|
||||
|
||||
const auto wei_g_k_c_xs_desc =
|
||||
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(params);
|
||||
|
||||
const auto out_g_n_k_wos_desc =
|
||||
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(params);
|
||||
|
||||
// Create tensors using tensor descriptors (supports multiple layouts)
|
||||
Tensor<InDataType> input(in_g_n_c_wis_desc);
|
||||
Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
|
||||
Tensor<OutDataType> output(out_g_n_k_wos_desc);
|
||||
|
||||
// Allocate device memory
|
||||
DeviceMem input_dev(input.mData.size() * sizeof(InDataType));
|
||||
DeviceMem weight_dev(weight.mData.size() * sizeof(WeiDataType));
|
||||
DeviceMem output_dev(output.mData.size() * sizeof(OutDataType));
|
||||
|
||||
// Initialize and copy tensors based on kernel type
|
||||
if(kernel_type == ConvKernelType::Forward)
|
||||
{
|
||||
initialize_and_copy_tensor(input, input_dev);
|
||||
initialize_and_copy_tensor(weight, weight_dev);
|
||||
}
|
||||
else if(kernel_type == ConvKernelType::BackwardData)
|
||||
{
|
||||
initialize_and_copy_tensor(weight, weight_dev);
|
||||
initialize_and_copy_tensor(output, output_dev);
|
||||
}
|
||||
else // BackwardWeight
|
||||
{
|
||||
initialize_and_copy_tensor(input, input_dev);
|
||||
initialize_and_copy_tensor(output, output_dev);
|
||||
}
|
||||
|
||||
// Dispatch to appropriate implementation with layout types
|
||||
if(kernel_type == ConvKernelType::Forward)
|
||||
{
|
||||
return test_conv_fwd_impl<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(
|
||||
params, input, weight, input_dev, weight_dev, output_dev);
|
||||
}
|
||||
else if(kernel_type == ConvKernelType::BackwardData)
|
||||
{
|
||||
return test_conv_bwd_data_impl<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(
|
||||
params, weight, output, weight_dev, output_dev, input_dev);
|
||||
}
|
||||
else // BackwardWeight
|
||||
{
|
||||
return test_conv_bwd_weight_impl<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(
|
||||
params, input, output, input_dev, output_dev, weight_dev);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace ck
|
||||
224
test/gpu_reference/test_gpu_reference_conv_bwd_data.cpp
Normal file
224
test/gpu_reference/test_gpu_reference_conv_bwd_data.cpp
Normal file
@@ -0,0 +1,224 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "gpu_reference_utils.hpp"
|
||||
|
||||
using namespace ck;
|
||||
using ck::test::ConvKernelType;
|
||||
|
||||
// Each test runs the GPU backward-data reference and checks it against the
// CPU reference via test_conv_gpu_ref.

TEST(GpuReferenceConvBwdData, Conv2DFP16Small)
{
    const bool ok = test::test_conv_gpu_ref<2, half_t, half_t, half_t>(
        test::conv_test_shapes::get_2d_small(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv2DFP32Medium)
{
    const bool ok = test::test_conv_gpu_ref<2, float, float, float>(
        test::conv_test_shapes::get_2d_medium(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv1DFP16)
{
    const bool ok = test::test_conv_gpu_ref<1, half_t, half_t, half_t>(
        test::conv_test_shapes::get_1d(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv3DFP16Small)
{
    const bool ok = test::test_conv_gpu_ref<3, half_t, half_t, half_t>(
        test::conv_test_shapes::get_3d_small(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv2DFP16Stride2)
{
    const bool ok = test::test_conv_gpu_ref<2, half_t, half_t, half_t>(
        test::conv_test_shapes::get_2d_stride2(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedG2)
{
    const bool ok = test::test_conv_gpu_ref<2, half_t, half_t, half_t>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedG4)
{
    const bool ok = test::test_conv_gpu_ref<2, float, float, float>(
        test::conv_test_shapes::get_2d_grouped_g4(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            float,
                                            float,
                                            float,
                                            tensor_layout::convolution::NHWGC,
                                            tensor_layout::convolution::GKYXC,
                                            tensor_layout::convolution::NHWGK>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            half_t,
                                            half_t,
                                            half_t,
                                            tensor_layout::convolution::NHWGC,
                                            tensor_layout::convolution::GKYXC,
                                            tensor_layout::convolution::NHWGK>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            float,
                                            float,
                                            float,
                                            tensor_layout::convolution::NGCHW,
                                            tensor_layout::convolution::GKYXC,
                                            tensor_layout::convolution::NGKHW>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            half_t,
                                            half_t,
                                            half_t,
                                            tensor_layout::convolution::NGCHW,
                                            tensor_layout::convolution::GKYXC,
                                            tensor_layout::convolution::NGKHW>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            float,
                                            float,
                                            float,
                                            tensor_layout::convolution::NGCHW,
                                            tensor_layout::convolution::GKCYX,
                                            tensor_layout::convolution::NGKHW>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            half_t,
                                            half_t,
                                            half_t,
                                            tensor_layout::convolution::NGCHW,
                                            tensor_layout::convolution::GKCYX,
                                            tensor_layout::convolution::NGKHW>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
{
    auto params = test::conv_test_shapes::get_3d_small();
    params.G_   = 2;  // two groups
    params.C_   = 16; // 8 input channels per group
    params.K_   = 16; // 8 output channels per group

    const bool ok = test::test_conv_gpu_ref<3,
                                            float,
                                            float,
                                            float,
                                            tensor_layout::convolution::NDHWGC,
                                            tensor_layout::convolution::GKZYXC,
                                            tensor_layout::convolution::NDHWGK>(
        params, ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
{
    auto params = test::conv_test_shapes::get_3d_small();
    params.G_   = 2;  // two groups
    params.C_   = 16; // 8 input channels per group
    params.K_   = 16; // 8 output channels per group

    const bool ok = test::test_conv_gpu_ref<3,
                                            half_t,
                                            half_t,
                                            half_t,
                                            tensor_layout::convolution::NDHWGC,
                                            tensor_layout::convolution::GKZYXC,
                                            tensor_layout::convolution::NDHWGK>(
        params, ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
{
    auto params = test::conv_test_shapes::get_3d_small();
    params.G_   = 2;  // two groups
    params.C_   = 16; // 8 input channels per group
    params.K_   = 16; // 8 output channels per group

    const bool ok = test::test_conv_gpu_ref<3,
                                            float,
                                            float,
                                            float,
                                            tensor_layout::convolution::NGCDHW,
                                            tensor_layout::convolution::GKCZYX,
                                            tensor_layout::convolution::NGKDHW>(
        params, ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdData, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
{
    auto params = test::conv_test_shapes::get_3d_small();
    params.G_   = 2;  // two groups
    params.C_   = 16; // 8 input channels per group
    params.K_   = 16; // 8 output channels per group

    const bool ok = test::test_conv_gpu_ref<3,
                                            half_t,
                                            half_t,
                                            half_t,
                                            tensor_layout::convolution::NGCDHW,
                                            tensor_layout::convolution::GKCZYX,
                                            tensor_layout::convolution::NGKDHW>(
        params, ConvKernelType::BackwardData);
    EXPECT_TRUE(ok);
}
||||
}
|
||||
224
test/gpu_reference/test_gpu_reference_conv_bwd_weight.cpp
Normal file
224
test/gpu_reference/test_gpu_reference_conv_bwd_weight.cpp
Normal file
@@ -0,0 +1,224 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "gpu_reference_utils.hpp"
|
||||
|
||||
using namespace ck;
|
||||
using ck::test::ConvKernelType;
|
||||
|
||||
// Each test runs the GPU backward-weight reference and checks it against the
// CPU reference via test_conv_gpu_ref.

TEST(GpuReferenceConvBwdWeight, Conv2DFP16Small)
{
    const bool ok = test::test_conv_gpu_ref<2, half_t, half_t, half_t>(
        test::conv_test_shapes::get_2d_small(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv2DFP32Medium)
{
    const bool ok = test::test_conv_gpu_ref<2, float, float, float>(
        test::conv_test_shapes::get_2d_medium(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv1DFP16)
{
    const bool ok = test::test_conv_gpu_ref<1, half_t, half_t, half_t>(
        test::conv_test_shapes::get_1d(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv3DFP16Small)
{
    const bool ok = test::test_conv_gpu_ref<3, half_t, half_t, half_t>(
        test::conv_test_shapes::get_3d_small(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv2DFP16Stride2)
{
    const bool ok = test::test_conv_gpu_ref<2, half_t, half_t, half_t>(
        test::conv_test_shapes::get_2d_stride2(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedG2)
{
    const bool ok = test::test_conv_gpu_ref<2, half_t, half_t, half_t>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedG4)
{
    const bool ok = test::test_conv_gpu_ref<2, float, float, float>(
        test::conv_test_shapes::get_2d_grouped_g4(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            float,
                                            float,
                                            float,
                                            tensor_layout::convolution::NHWGC,
                                            tensor_layout::convolution::GKYXC,
                                            tensor_layout::convolution::NHWGK>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            half_t,
                                            half_t,
                                            half_t,
                                            tensor_layout::convolution::NHWGC,
                                            tensor_layout::convolution::GKYXC,
                                            tensor_layout::convolution::NHWGK>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            float,
                                            float,
                                            float,
                                            tensor_layout::convolution::NGCHW,
                                            tensor_layout::convolution::GKYXC,
                                            tensor_layout::convolution::NGKHW>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            half_t,
                                            half_t,
                                            half_t,
                                            tensor_layout::convolution::NGCHW,
                                            tensor_layout::convolution::GKYXC,
                                            tensor_layout::convolution::NGKHW>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            float,
                                            float,
                                            float,
                                            tensor_layout::convolution::NGCHW,
                                            tensor_layout::convolution::GKCYX,
                                            tensor_layout::convolution::NGKHW>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
{
    const bool ok = test::test_conv_gpu_ref<2,
                                            half_t,
                                            half_t,
                                            half_t,
                                            tensor_layout::convolution::NGCHW,
                                            tensor_layout::convolution::GKCYX,
                                            tensor_layout::convolution::NGKHW>(
        test::conv_test_shapes::get_2d_grouped_g2(), ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
{
    auto params = test::conv_test_shapes::get_3d_small();
    params.G_   = 2;  // two groups
    params.C_   = 16; // 8 input channels per group
    params.K_   = 16; // 8 output channels per group

    const bool ok = test::test_conv_gpu_ref<3,
                                            float,
                                            float,
                                            float,
                                            tensor_layout::convolution::NDHWGC,
                                            tensor_layout::convolution::GKZYXC,
                                            tensor_layout::convolution::NDHWGK>(
        params, ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
{
    auto params = test::conv_test_shapes::get_3d_small();
    params.G_   = 2;  // two groups
    params.C_   = 16; // 8 input channels per group
    params.K_   = 16; // 8 output channels per group

    const bool ok = test::test_conv_gpu_ref<3,
                                            half_t,
                                            half_t,
                                            half_t,
                                            tensor_layout::convolution::NDHWGC,
                                            tensor_layout::convolution::GKZYXC,
                                            tensor_layout::convolution::NDHWGK>(
        params, ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
{
    auto params = test::conv_test_shapes::get_3d_small();
    params.G_   = 2;  // two groups
    params.C_   = 16; // 8 input channels per group
    params.K_   = 16; // 8 output channels per group

    const bool ok = test::test_conv_gpu_ref<3,
                                            float,
                                            float,
                                            float,
                                            tensor_layout::convolution::NGCDHW,
                                            tensor_layout::convolution::GKCZYX,
                                            tensor_layout::convolution::NGKDHW>(
        params, ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}

TEST(GpuReferenceConvBwdWeight, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
{
    auto params = test::conv_test_shapes::get_3d_small();
    params.G_   = 2;  // two groups
    params.C_   = 16; // 8 input channels per group
    params.K_   = 16; // 8 output channels per group

    const bool ok = test::test_conv_gpu_ref<3,
                                            half_t,
                                            half_t,
                                            half_t,
                                            tensor_layout::convolution::NGCDHW,
                                            tensor_layout::convolution::GKCZYX,
                                            tensor_layout::convolution::NGKDHW>(
        params, ConvKernelType::BackwardWeight);
    EXPECT_TRUE(ok);
}
||||
}
|
||||
222
test/gpu_reference/test_gpu_reference_conv_fwd.cpp
Normal file
222
test/gpu_reference/test_gpu_reference_conv_fwd.cpp
Normal file
@@ -0,0 +1,222 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "gpu_reference_utils.hpp"
|
||||
|
||||
using namespace ck;
|
||||
using ck::test::ConvKernelType;
|
||||
|
||||
TEST(GpuReferenceConvFwd, Conv2DFP16Small)
|
||||
{
|
||||
auto params = test::conv_test_shapes::get_2d_small();
|
||||
bool result =
|
||||
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(GpuReferenceConvFwd, Conv2DFP32Medium)
|
||||
{
|
||||
auto params = test::conv_test_shapes::get_2d_medium();
|
||||
bool result = test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::Forward);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(GpuReferenceConvFwd, Conv1DFP16)
|
||||
{
|
||||
auto params = test::conv_test_shapes::get_1d();
|
||||
bool result =
|
||||
test::test_conv_gpu_ref<1, half_t, half_t, half_t>(params, ConvKernelType::Forward);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(GpuReferenceConvFwd, Conv3DFP16Small)
|
||||
{
|
||||
auto params = test::conv_test_shapes::get_3d_small();
|
||||
bool result =
|
||||
test::test_conv_gpu_ref<3, half_t, half_t, half_t>(params, ConvKernelType::Forward);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(GpuReferenceConvFwd, Conv2DFP16Stride2)
|
||||
{
|
||||
auto params = test::conv_test_shapes::get_2d_stride2();
|
||||
bool result =
|
||||
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(GpuReferenceConvFwd, Conv2DFP16GroupedG2)
|
||||
{
|
||||
auto params = test::conv_test_shapes::get_2d_grouped_g2();
|
||||
bool result =
|
||||
test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
TEST(GpuReferenceConvFwd, Conv2DFP32GroupedG4)
|
||||
{
|
||||
auto params = test::conv_test_shapes::get_2d_grouped_g4();
|
||||
bool result = test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::Forward);
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
// GPU reference, forward 2D grouped conv, FP32, explicit NHWGC/GKYXC/NHWGK layouts.
TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
{
    namespace layout = tensor_layout::convolution;

    const auto shape = test::conv_test_shapes::get_2d_grouped_g2();
    const bool ok    = test::
        test_conv_gpu_ref<2, float, float, float, layout::NHWGC, layout::GKYXC, layout::NHWGK>(
            shape, ConvKernelType::Forward);
    EXPECT_TRUE(ok);
}
|
||||
|
||||
// GPU reference, forward 2D grouped conv, FP16, explicit NHWGC/GKYXC/NHWGK layouts.
TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
{
    namespace layout = tensor_layout::convolution;

    const auto shape = test::conv_test_shapes::get_2d_grouped_g2();
    const bool ok    = test::test_conv_gpu_ref<2,
                                            half_t,
                                            half_t,
                                            half_t,
                                            layout::NHWGC,
                                            layout::GKYXC,
                                            layout::NHWGK>(shape, ConvKernelType::Forward);
    EXPECT_TRUE(ok);
}
|
||||
|
||||
// GPU reference, forward 2D grouped conv, FP32, explicit NGCHW/GKYXC/NGKHW layouts.
TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
{
    namespace layout = tensor_layout::convolution;

    const auto shape = test::conv_test_shapes::get_2d_grouped_g2();
    const bool ok    = test::
        test_conv_gpu_ref<2, float, float, float, layout::NGCHW, layout::GKYXC, layout::NGKHW>(
            shape, ConvKernelType::Forward);
    EXPECT_TRUE(ok);
}
|
||||
|
||||
// GPU reference, forward 2D grouped conv, FP16, explicit NGCHW/GKYXC/NGKHW layouts.
TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
{
    namespace layout = tensor_layout::convolution;

    const auto shape = test::conv_test_shapes::get_2d_grouped_g2();
    const bool ok    = test::test_conv_gpu_ref<2,
                                            half_t,
                                            half_t,
                                            half_t,
                                            layout::NGCHW,
                                            layout::GKYXC,
                                            layout::NGKHW>(shape, ConvKernelType::Forward);
    EXPECT_TRUE(ok);
}
|
||||
|
||||
// GPU reference, forward 2D grouped conv, FP32, explicit NGCHW/GKCYX/NGKHW layouts.
TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
{
    namespace layout = tensor_layout::convolution;

    const auto shape = test::conv_test_shapes::get_2d_grouped_g2();
    const bool ok    = test::
        test_conv_gpu_ref<2, float, float, float, layout::NGCHW, layout::GKCYX, layout::NGKHW>(
            shape, ConvKernelType::Forward);
    EXPECT_TRUE(ok);
}
|
||||
|
||||
// GPU reference, forward 2D grouped conv, FP16, explicit NGCHW/GKCYX/NGKHW layouts.
TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
{
    namespace layout = tensor_layout::convolution;

    const auto shape = test::conv_test_shapes::get_2d_grouped_g2();
    const bool ok    = test::test_conv_gpu_ref<2,
                                            half_t,
                                            half_t,
                                            half_t,
                                            layout::NGCHW,
                                            layout::GKCYX,
                                            layout::NGKHW>(shape, ConvKernelType::Forward);
    EXPECT_TRUE(ok);
}
|
||||
|
||||
// GPU reference, forward 3D grouped conv, FP32, explicit NDHWGC/GKZYXC/NDHWGK layouts.
TEST(GpuReferenceConvFwd, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
{
    namespace layout = tensor_layout::convolution;

    // Start from the small 3D shape and make it grouped: G=2, 8 channels
    // and 8 filters per group (C and K hold the totals across groups).
    auto shape = test::conv_test_shapes::get_3d_small();
    shape.G_   = 2;
    shape.C_   = 16;
    shape.K_   = 16;

    const bool ok = test::test_conv_gpu_ref<3,
                                            float,
                                            float,
                                            float,
                                            layout::NDHWGC,
                                            layout::GKZYXC,
                                            layout::NDHWGK>(shape, ConvKernelType::Forward);
    EXPECT_TRUE(ok);
}
|
||||
|
||||
// GPU reference, forward 3D grouped conv, FP16, explicit NDHWGC/GKZYXC/NDHWGK layouts.
TEST(GpuReferenceConvFwd, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
{
    namespace layout = tensor_layout::convolution;

    // Start from the small 3D shape and make it grouped: G=2, 8 channels
    // and 8 filters per group (C and K hold the totals across groups).
    auto shape = test::conv_test_shapes::get_3d_small();
    shape.G_   = 2;
    shape.C_   = 16;
    shape.K_   = 16;

    const bool ok = test::test_conv_gpu_ref<3,
                                            half_t,
                                            half_t,
                                            half_t,
                                            layout::NDHWGC,
                                            layout::GKZYXC,
                                            layout::NDHWGK>(shape, ConvKernelType::Forward);
    EXPECT_TRUE(ok);
}
|
||||
|
||||
// GPU reference, forward 3D grouped conv, FP32, explicit NGCDHW/GKCZYX/NGKDHW layouts.
TEST(GpuReferenceConvFwd, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
{
    namespace layout = tensor_layout::convolution;

    // Start from the small 3D shape and make it grouped: G=2, 8 channels
    // and 8 filters per group (C and K hold the totals across groups).
    auto shape = test::conv_test_shapes::get_3d_small();
    shape.G_   = 2;
    shape.C_   = 16;
    shape.K_   = 16;

    const bool ok = test::test_conv_gpu_ref<3,
                                            float,
                                            float,
                                            float,
                                            layout::NGCDHW,
                                            layout::GKCZYX,
                                            layout::NGKDHW>(shape, ConvKernelType::Forward);
    EXPECT_TRUE(ok);
}
|
||||
|
||||
// GPU reference, forward 3D grouped conv, FP16, explicit NGCDHW/GKCZYX/NGKDHW layouts.
TEST(GpuReferenceConvFwd, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
{
    namespace layout = tensor_layout::convolution;

    // Start from the small 3D shape and make it grouped: G=2, 8 channels
    // and 8 filters per group (C and K hold the totals across groups).
    auto shape = test::conv_test_shapes::get_3d_small();
    shape.G_   = 2;
    shape.C_   = 16;
    shape.K_   = 16;

    const bool ok = test::test_conv_gpu_ref<3,
                                            half_t,
                                            half_t,
                                            half_t,
                                            layout::NGCDHW,
                                            layout::GKCZYX,
                                            layout::NGKDHW>(shape, ConvKernelType::Forward);
    EXPECT_TRUE(ok);
}
|
||||
@@ -73,7 +73,7 @@ bool RunConvBwdDataTest(const ck::utils::conv::ConvParam& param, ck::index_t spl
|
||||
InLayout,
|
||||
DataType,
|
||||
DataType,
|
||||
DataType>(true, // do_verification
|
||||
DataType>(2, // do_verification
|
||||
1, // init_method
|
||||
false, // do_log
|
||||
false, // time_kernel
|
||||
|
||||
@@ -47,7 +47,7 @@ class TestGroupedConvndBwdDataXdl : public ::testing::Test
|
||||
DataType,
|
||||
DataType,
|
||||
DataType>(
|
||||
true, // do_verification
|
||||
2, // do_verification
|
||||
1, // init_method: integer value
|
||||
false, // do_log
|
||||
false, // time_kernel
|
||||
|
||||
@@ -73,7 +73,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType>(
|
||||
true, // do_verification
|
||||
2, // do_verification
|
||||
1, // init_method: integer value
|
||||
false, // do_log
|
||||
false, // time_kernel
|
||||
|
||||
@@ -80,7 +80,7 @@ bool RunConvBwdWeightTest(const ck::utils::conv::ConvParam& param, ck::index_t s
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType>(
|
||||
true, // do_verification
|
||||
2, // do_verification
|
||||
1, // init_method
|
||||
false, // do_log
|
||||
false, // time_kernel
|
||||
|
||||
@@ -46,7 +46,7 @@ class TestGroupedConvndFwd : public ::testing::Test
|
||||
DataType,
|
||||
DataType,
|
||||
IndexType>(
|
||||
true, // do_verification
|
||||
2, // do_verification
|
||||
1, // init_method: integer value
|
||||
false, // do_log
|
||||
false, // time_kernel
|
||||
|
||||
@@ -77,7 +77,7 @@ bool RunConvTest(const ck::utils::conv::ConvParam& param)
|
||||
DataType,
|
||||
DataType,
|
||||
DataType,
|
||||
IndexType>(true, // do_verification
|
||||
IndexType>(2, // do_verification
|
||||
1, // init_method
|
||||
false, // do_log
|
||||
false, // time_kernel
|
||||
|
||||
Reference in New Issue
Block a user