mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-03-21 07:37:38 +00:00
* test_convnd_fwd
* test_convnd_bwd_data
* test_conv_bwd_data_scale
* test_grouped_convnd_fwd_clamp
* test_grouped_convnd_fwd_scale
* multiple A/B tensors and D tensor for fwd GPU ref
* test_grouped_convnd_fwd_scaleadd_ab
* test_grouped_convnd_fwd_bias_clamp
* test_grouped_convnd_fwd_bilinear
* test_grouped_convnd_fwd_gk_bias_clamp
* Extend GPU reference to enable batchnorm epilogue
* test_grouped_convnd_fwd{,_gk}_bias_bnorm_clamp
* test_grouped_conv_bwd_data_bilinear
* test_grouped_convnd_bwd_weight_bilinear
* Add missing template instantiation
* Perform operations in float in reference
* Slightly increase tolerance for batchnorm profiler
* Revert "Slightly increase tolerance for batchnorm profiler"
This reverts commit a3b2475229.
* Revert "test_grouped_convnd_fwd{,_gk}_bias_bnorm_clamp"
This reverts commit 6da4576060.
* Revert "Extend GPU reference to enable batchnorm epilogue"
This reverts commit e2f75fa10e.
* Clarify variable names
* Refactor elementwise ops into helper functions
* Make helpers C++17-compatible
611 lines
28 KiB
C++
611 lines
28 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#pragma once
|
|
|
|
#include "ck/ck.hpp"
|
|
#include "ck/host_utility/hip_check_error.hpp"
|
|
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
|
#include "ck/library/utility/check_err.hpp"
|
|
#include "ck/library/utility/device_memory.hpp"
|
|
#include "ck/library/utility/host_tensor.hpp"
|
|
#include "ck/library/utility/host_tensor_generator.hpp"
|
|
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
|
|
|
// CPU references
|
|
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
|
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
|
|
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
|
|
|
|
// GPU references
|
|
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
|
|
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
|
|
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
|
|
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
|
|
|
#include "common_test_params.hpp"
|
|
|
|
namespace ck {
|
|
namespace test {
|
|
|
|
enum class ConvKernelType
|
|
{
|
|
Forward,
|
|
BackwardData,
|
|
BackwardWeight
|
|
};
|
|
|
|
// Helper function to initialize and copy a tensor to device
|
|
template <typename DataType>
|
|
void initialize_and_copy_tensor(Tensor<DataType>& host_tensor, DeviceMem& device_mem)
|
|
{
|
|
host_tensor.GenerateTensorValue(GeneratorTensor_2<DataType>{-5, 5});
|
|
device_mem.ToDevice(host_tensor.mData.data());
|
|
}
|
|
|
|
// Helper to get default layout types based on NDimSpatial
|
|
template <index_t NDimSpatial>
|
|
struct DefaultConvLayouts
|
|
{
|
|
using InLayout = std::conditional_t<NDimSpatial == 3,
|
|
tensor_layout::convolution::GNCDHW,
|
|
std::conditional_t<NDimSpatial == 2,
|
|
tensor_layout::convolution::GNCHW,
|
|
tensor_layout::convolution::GNCW>>;
|
|
using WeiLayout = std::conditional_t<NDimSpatial == 3,
|
|
tensor_layout::convolution::GKCZYX,
|
|
std::conditional_t<NDimSpatial == 2,
|
|
tensor_layout::convolution::GKCYX,
|
|
tensor_layout::convolution::GKCX>>;
|
|
using OutLayout = std::conditional_t<NDimSpatial == 3,
|
|
tensor_layout::convolution::GNKDHW,
|
|
std::conditional_t<NDimSpatial == 2,
|
|
tensor_layout::convolution::GNKHW,
|
|
tensor_layout::convolution::GNKW>>;
|
|
};
|
|
|
|
// Forward convolution implementation
|
|
template <index_t NDimSpatial,
|
|
typename InDataType,
|
|
typename WeiDataType,
|
|
typename OutDataType,
|
|
typename InLayout,
|
|
typename WeiLayout,
|
|
typename OutLayout>
|
|
bool test_conv_fwd_impl(const ck::utils::conv::ConvParam& params,
|
|
const Tensor<InDataType>& input_cpu,
|
|
const Tensor<WeiDataType>& weight_cpu,
|
|
DeviceMem& input_dev,
|
|
DeviceMem& weight_dev,
|
|
DeviceMem& output_dev)
|
|
{
|
|
using InElementOp = tensor_operation::element_wise::PassThrough;
|
|
using WeiElementOp = tensor_operation::element_wise::PassThrough;
|
|
using OutElementOp = tensor_operation::element_wise::PassThrough;
|
|
|
|
// Call GPU reference with ConvParam directly
|
|
ref::naive_conv_fwd<InLayout,
|
|
WeiLayout,
|
|
OutLayout,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InElementOp,
|
|
WeiElementOp,
|
|
OutElementOp>(
|
|
reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<OutDataType*>(output_dev.GetDeviceBuffer()),
|
|
params);
|
|
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
|
|
// Run CPU reference
|
|
std::vector<long_index_t> strides_long(params.conv_filter_strides_.begin(),
|
|
params.conv_filter_strides_.end());
|
|
std::vector<long_index_t> dilations_long(params.conv_filter_dilations_.begin(),
|
|
params.conv_filter_dilations_.end());
|
|
std::vector<long_index_t> pads_long(params.input_left_pads_.begin(),
|
|
params.input_left_pads_.end());
|
|
|
|
Tensor<InDataType> input_ref = input_cpu;
|
|
Tensor<WeiDataType> weight_ref = weight_cpu;
|
|
Tensor<OutDataType> output_ref(
|
|
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(params));
|
|
|
|
auto ref_conv = tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InElementOp,
|
|
WeiElementOp,
|
|
OutElementOp>();
|
|
auto ref_invoker = ref_conv.MakeInvoker();
|
|
auto ref_arg = ref_conv.MakeArgument(input_ref,
|
|
weight_ref,
|
|
output_ref,
|
|
strides_long,
|
|
dilations_long,
|
|
pads_long,
|
|
pads_long,
|
|
InElementOp{},
|
|
WeiElementOp{},
|
|
OutElementOp{});
|
|
ref_invoker.Run(ref_arg);
|
|
|
|
// Copy result from device and compare
|
|
Tensor<OutDataType> output_gpu(output_ref.mDesc);
|
|
output_dev.FromDevice(output_gpu.mData.data());
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
|
|
// Compare results
|
|
return ck::utils::check_err(output_gpu, output_ref);
|
|
}
|
|
|
|
// Backward data convolution implementation
|
|
template <index_t NDimSpatial,
|
|
typename InDataType,
|
|
typename WeiDataType,
|
|
typename OutDataType,
|
|
typename InLayout,
|
|
typename WeiLayout,
|
|
typename OutLayout>
|
|
bool test_conv_bwd_data_impl(const ck::utils::conv::ConvParam& params,
|
|
const Tensor<WeiDataType>& weight_cpu,
|
|
const Tensor<OutDataType>& output_cpu,
|
|
DeviceMem& weight_dev,
|
|
DeviceMem& output_dev,
|
|
DeviceMem& input_dev)
|
|
{
|
|
using InElementOp = tensor_operation::element_wise::PassThrough;
|
|
using WeiElementOp = tensor_operation::element_wise::PassThrough;
|
|
using OutElementOp = tensor_operation::element_wise::PassThrough;
|
|
|
|
// Call GPU reference with ConvParam directly
|
|
ref::naive_conv_bwd_data<InLayout,
|
|
WeiLayout,
|
|
OutLayout,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InElementOp,
|
|
WeiElementOp,
|
|
OutElementOp>(
|
|
reinterpret_cast<InDataType*>(input_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const OutDataType*>(output_dev.GetDeviceBuffer()),
|
|
params);
|
|
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
|
|
// Run CPU reference
|
|
std::vector<long_index_t> strides_long(params.conv_filter_strides_.begin(),
|
|
params.conv_filter_strides_.end());
|
|
std::vector<long_index_t> dilations_long(params.conv_filter_dilations_.begin(),
|
|
params.conv_filter_dilations_.end());
|
|
std::vector<long_index_t> pads_long(params.input_left_pads_.begin(),
|
|
params.input_left_pads_.end());
|
|
|
|
Tensor<InDataType> input_ref(
|
|
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(params));
|
|
Tensor<WeiDataType> weight_ref = weight_cpu;
|
|
Tensor<OutDataType> output_ref = output_cpu;
|
|
|
|
auto ref_conv = tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InElementOp,
|
|
WeiElementOp,
|
|
OutElementOp>();
|
|
auto ref_invoker = ref_conv.MakeInvoker();
|
|
auto ref_arg = ref_conv.MakeArgument(input_ref,
|
|
weight_ref,
|
|
output_ref,
|
|
strides_long,
|
|
dilations_long,
|
|
pads_long,
|
|
pads_long,
|
|
InElementOp{},
|
|
WeiElementOp{},
|
|
OutElementOp{});
|
|
ref_invoker.Run(ref_arg);
|
|
|
|
// Copy result from device and compare
|
|
Tensor<InDataType> input_gpu(input_ref.mDesc);
|
|
input_dev.FromDevice(input_gpu.mData.data());
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
|
|
// Compare results
|
|
return ck::utils::check_err(input_gpu, input_ref);
|
|
}
|
|
|
|
// Backward weight convolution implementation
|
|
template <index_t NDimSpatial,
|
|
typename InDataType,
|
|
typename WeiDataType,
|
|
typename OutDataType,
|
|
typename InLayout,
|
|
typename WeiLayout,
|
|
typename OutLayout>
|
|
bool test_conv_bwd_weight_impl(const ck::utils::conv::ConvParam& params,
|
|
const Tensor<InDataType>& input_cpu,
|
|
const Tensor<OutDataType>& output_cpu,
|
|
DeviceMem& input_dev,
|
|
DeviceMem& output_dev,
|
|
DeviceMem& weight_dev)
|
|
{
|
|
using InElementOp = tensor_operation::element_wise::PassThrough;
|
|
using WeiElementOp = tensor_operation::element_wise::PassThrough;
|
|
using OutElementOp = tensor_operation::element_wise::PassThrough;
|
|
|
|
// Call GPU reference with ConvParam directly
|
|
ref::naive_conv_bwd_weight<InLayout,
|
|
WeiLayout,
|
|
OutLayout,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InElementOp,
|
|
WeiElementOp,
|
|
OutElementOp>(
|
|
reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<WeiDataType*>(weight_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const OutDataType*>(output_dev.GetDeviceBuffer()),
|
|
params);
|
|
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
|
|
// Run CPU reference
|
|
std::vector<long_index_t> strides_long(params.conv_filter_strides_.begin(),
|
|
params.conv_filter_strides_.end());
|
|
std::vector<long_index_t> dilations_long(params.conv_filter_dilations_.begin(),
|
|
params.conv_filter_dilations_.end());
|
|
std::vector<long_index_t> pads_long(params.input_left_pads_.begin(),
|
|
params.input_left_pads_.end());
|
|
|
|
Tensor<InDataType> input_ref = input_cpu;
|
|
Tensor<WeiDataType> weight_ref(
|
|
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(params));
|
|
Tensor<OutDataType> output_ref = output_cpu;
|
|
|
|
auto ref_conv = tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InElementOp,
|
|
WeiElementOp,
|
|
OutElementOp>();
|
|
auto ref_invoker = ref_conv.MakeInvoker();
|
|
auto ref_arg = ref_conv.MakeArgument(input_ref,
|
|
weight_ref,
|
|
output_ref,
|
|
strides_long,
|
|
dilations_long,
|
|
pads_long,
|
|
pads_long,
|
|
InElementOp{},
|
|
WeiElementOp{},
|
|
OutElementOp{});
|
|
ref_invoker.Run(ref_arg);
|
|
|
|
// Copy result from device and compare
|
|
Tensor<WeiDataType> weight_gpu(weight_ref.mDesc);
|
|
weight_dev.FromDevice(weight_gpu.mData.data());
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
|
|
// Compare results
|
|
return ck::utils::check_err(weight_gpu, weight_ref);
|
|
}
|
|
|
|
// Main test function - dispatches to specific implementations
|
|
template <index_t NDimSpatial,
|
|
typename InDataType,
|
|
typename WeiDataType,
|
|
typename OutDataType,
|
|
typename InLayout = typename DefaultConvLayouts<NDimSpatial>::InLayout,
|
|
typename WeiLayout = typename DefaultConvLayouts<NDimSpatial>::WeiLayout,
|
|
typename OutLayout = typename DefaultConvLayouts<NDimSpatial>::OutLayout>
|
|
bool test_conv_gpu_ref(const ck::utils::conv::ConvParam& params, ConvKernelType kernel_type)
|
|
{
|
|
// Create tensor descriptors using the specified layouts
|
|
const auto in_g_n_c_wis_desc =
|
|
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(params);
|
|
|
|
const auto wei_g_k_c_xs_desc =
|
|
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(params);
|
|
|
|
const auto out_g_n_k_wos_desc =
|
|
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(params);
|
|
|
|
// Create tensors using tensor descriptors (supports multiple layouts)
|
|
Tensor<InDataType> input(in_g_n_c_wis_desc);
|
|
Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
|
|
Tensor<OutDataType> output(out_g_n_k_wos_desc);
|
|
|
|
// Allocate device memory
|
|
DeviceMem input_dev(input.mData.size() * sizeof(InDataType));
|
|
DeviceMem weight_dev(weight.mData.size() * sizeof(WeiDataType));
|
|
DeviceMem output_dev(output.mData.size() * sizeof(OutDataType));
|
|
|
|
// Initialize and copy tensors based on kernel type
|
|
if(kernel_type == ConvKernelType::Forward)
|
|
{
|
|
initialize_and_copy_tensor(input, input_dev);
|
|
initialize_and_copy_tensor(weight, weight_dev);
|
|
}
|
|
else if(kernel_type == ConvKernelType::BackwardData)
|
|
{
|
|
initialize_and_copy_tensor(weight, weight_dev);
|
|
initialize_and_copy_tensor(output, output_dev);
|
|
}
|
|
else // BackwardWeight
|
|
{
|
|
initialize_and_copy_tensor(input, input_dev);
|
|
initialize_and_copy_tensor(output, output_dev);
|
|
}
|
|
|
|
// Dispatch to appropriate implementation with layout types
|
|
if(kernel_type == ConvKernelType::Forward)
|
|
{
|
|
return test_conv_fwd_impl<NDimSpatial,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InLayout,
|
|
WeiLayout,
|
|
OutLayout>(
|
|
params, input, weight, input_dev, weight_dev, output_dev);
|
|
}
|
|
else if(kernel_type == ConvKernelType::BackwardData)
|
|
{
|
|
return test_conv_bwd_data_impl<NDimSpatial,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InLayout,
|
|
WeiLayout,
|
|
OutLayout>(
|
|
params, weight, output, weight_dev, output_dev, input_dev);
|
|
}
|
|
else // BackwardWeight
|
|
{
|
|
return test_conv_bwd_weight_impl<NDimSpatial,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InLayout,
|
|
WeiLayout,
|
|
OutLayout>(
|
|
params, input, output, input_dev, output_dev, weight_dev);
|
|
}
|
|
}
|
|
|
|
// Forward convolution with D tensor support
|
|
template <index_t NDimSpatial,
|
|
typename InDataType,
|
|
typename WeiDataType,
|
|
typename OutDataType,
|
|
typename InLayout,
|
|
typename WeiLayout,
|
|
typename OutLayout,
|
|
typename OutElementOp>
|
|
bool test_conv_fwd_with_d_tensor_impl(const ck::utils::conv::ConvParam& params,
|
|
const Tensor<InDataType>& input_cpu,
|
|
const Tensor<WeiDataType>& weight_cpu,
|
|
const Tensor<OutDataType>& d_cpu,
|
|
DeviceMem& input_dev,
|
|
DeviceMem& weight_dev,
|
|
DeviceMem& d_dev,
|
|
DeviceMem& output_dev,
|
|
OutElementOp out_element_op)
|
|
{
|
|
using InElementOp = tensor_operation::element_wise::PassThrough;
|
|
using WeiElementOp = tensor_operation::element_wise::PassThrough;
|
|
|
|
// Create D tensor lengths and strides for GPU reference
|
|
std::vector<index_t> d_lengths_vec(NDimSpatial + 3);
|
|
d_lengths_vec[0] = params.G_;
|
|
d_lengths_vec[1] = params.N_;
|
|
d_lengths_vec[2] = params.K_;
|
|
for(index_t i = 0; i < NDimSpatial; ++i)
|
|
{
|
|
d_lengths_vec[3 + i] = static_cast<index_t>(params.output_spatial_lengths_[i]);
|
|
}
|
|
|
|
std::vector<index_t> d_strides_vec =
|
|
ref::compute_conv_tensor_strides<OutLayout>(d_lengths_vec, params.num_dim_spatial_);
|
|
|
|
std::array<const OutDataType*, 1> d_ptrs = {
|
|
reinterpret_cast<const OutDataType*>(d_dev.GetDeviceBuffer())};
|
|
std::array<std::vector<index_t>, 1> d_lengths = {d_lengths_vec};
|
|
std::array<std::vector<index_t>, 1> d_strides = {d_strides_vec};
|
|
|
|
// Call GPU reference with D tensor
|
|
std::array<const InDataType*, 1> in_ptrs = {
|
|
reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer())};
|
|
std::array<const WeiDataType*, 1> wei_ptrs = {
|
|
reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer())};
|
|
|
|
ref::naive_conv_fwd_multi_abd<0,
|
|
0,
|
|
1,
|
|
InLayout,
|
|
WeiLayout,
|
|
OutLayout,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InElementOp,
|
|
WeiElementOp,
|
|
OutElementOp,
|
|
OutDataType>( // Explicitly specify TD = OutDataType
|
|
in_ptrs,
|
|
wei_ptrs,
|
|
d_ptrs,
|
|
reinterpret_cast<OutDataType*>(output_dev.GetDeviceBuffer()),
|
|
params,
|
|
d_lengths,
|
|
d_strides,
|
|
InElementOp{},
|
|
WeiElementOp{},
|
|
out_element_op);
|
|
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
|
|
// Run CPU reference
|
|
std::vector<long_index_t> strides_long(params.conv_filter_strides_.begin(),
|
|
params.conv_filter_strides_.end());
|
|
std::vector<long_index_t> dilations_long(params.conv_filter_dilations_.begin(),
|
|
params.conv_filter_dilations_.end());
|
|
std::vector<long_index_t> pads_long(params.input_left_pads_.begin(),
|
|
params.input_left_pads_.end());
|
|
|
|
Tensor<InDataType> input_ref = input_cpu;
|
|
Tensor<WeiDataType> weight_ref = weight_cpu;
|
|
Tensor<OutDataType> output_ref(
|
|
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(params));
|
|
|
|
std::array<Tensor<OutDataType>, 1> d_tensors_ref = {d_cpu};
|
|
|
|
auto ref_conv = tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InElementOp,
|
|
WeiElementOp,
|
|
OutElementOp,
|
|
0, // NumA
|
|
0, // NumB
|
|
1 // NumD
|
|
>();
|
|
auto ref_invoker = ref_conv.MakeInvoker();
|
|
auto ref_arg = ref_conv.MakeArgument(input_ref,
|
|
weight_ref,
|
|
output_ref,
|
|
strides_long,
|
|
dilations_long,
|
|
pads_long,
|
|
pads_long,
|
|
InElementOp{},
|
|
WeiElementOp{},
|
|
out_element_op,
|
|
{}, // A tensors
|
|
{}, // B tensors
|
|
d_tensors_ref);
|
|
ref_invoker.Run(ref_arg);
|
|
|
|
// Copy result from device and compare
|
|
Tensor<OutDataType> output_gpu(output_ref.mDesc);
|
|
output_dev.FromDevice(output_gpu.mData.data());
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
|
|
// Compare results
|
|
return ck::utils::check_err(output_gpu, output_ref);
|
|
}
|
|
|
|
// Forward convolution with multiple A/B tensor support
|
|
template <index_t NDimSpatial,
|
|
typename InDataType,
|
|
typename WeiDataType,
|
|
typename OutDataType,
|
|
typename InLayout,
|
|
typename WeiLayout,
|
|
typename OutLayout,
|
|
typename InElementOp,
|
|
typename WeiElementOp>
|
|
bool test_conv_fwd_with_multi_ab_impl(const ck::utils::conv::ConvParam& params,
|
|
const Tensor<InDataType>& input_cpu,
|
|
const Tensor<WeiDataType>& weight_cpu,
|
|
const Tensor<InDataType>& a_extra_cpu,
|
|
const Tensor<WeiDataType>& b_extra_cpu,
|
|
DeviceMem& input_dev,
|
|
DeviceMem& weight_dev,
|
|
DeviceMem& a_extra_dev,
|
|
DeviceMem& b_extra_dev,
|
|
DeviceMem& output_dev,
|
|
InElementOp in_element_op,
|
|
WeiElementOp wei_element_op)
|
|
{
|
|
using OutElementOp = tensor_operation::element_wise::PassThrough;
|
|
|
|
// Call GPU reference with extra A and B tensors
|
|
std::array<const InDataType*, 2> in_ptrs = {
|
|
reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const InDataType*>(a_extra_dev.GetDeviceBuffer())};
|
|
std::array<const WeiDataType*, 2> wei_ptrs = {
|
|
reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
|
|
reinterpret_cast<const WeiDataType*>(b_extra_dev.GetDeviceBuffer())};
|
|
std::array<const OutDataType*, 0> d_ptrs = {};
|
|
std::array<std::vector<index_t>, 0> d_lengths = {};
|
|
std::array<std::vector<index_t>, 0> d_strides = {};
|
|
|
|
ref::naive_conv_fwd_multi_abd<1, 1, 0, InLayout, WeiLayout, OutLayout>(
|
|
in_ptrs,
|
|
wei_ptrs,
|
|
d_ptrs,
|
|
reinterpret_cast<OutDataType*>(output_dev.GetDeviceBuffer()),
|
|
params,
|
|
d_lengths,
|
|
d_strides,
|
|
in_element_op,
|
|
wei_element_op,
|
|
OutElementOp{});
|
|
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
|
|
// Run CPU reference
|
|
std::vector<long_index_t> strides_long(params.conv_filter_strides_.begin(),
|
|
params.conv_filter_strides_.end());
|
|
std::vector<long_index_t> dilations_long(params.conv_filter_dilations_.begin(),
|
|
params.conv_filter_dilations_.end());
|
|
std::vector<long_index_t> pads_long(params.input_left_pads_.begin(),
|
|
params.input_left_pads_.end());
|
|
|
|
Tensor<InDataType> input_ref = input_cpu;
|
|
Tensor<WeiDataType> weight_ref = weight_cpu;
|
|
Tensor<OutDataType> output_ref(
|
|
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(params));
|
|
|
|
std::array<Tensor<InDataType>, 1> a_tensors_ref = {a_extra_cpu};
|
|
std::array<Tensor<WeiDataType>, 1> b_tensors_ref = {b_extra_cpu};
|
|
|
|
auto ref_conv = tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
|
InDataType,
|
|
WeiDataType,
|
|
OutDataType,
|
|
InElementOp,
|
|
WeiElementOp,
|
|
OutElementOp,
|
|
1, // NumA
|
|
1, // NumB
|
|
0 // NumD
|
|
>();
|
|
auto ref_invoker = ref_conv.MakeInvoker();
|
|
auto ref_arg = ref_conv.MakeArgument(input_ref,
|
|
weight_ref,
|
|
output_ref,
|
|
strides_long,
|
|
dilations_long,
|
|
pads_long,
|
|
pads_long,
|
|
in_element_op,
|
|
wei_element_op,
|
|
OutElementOp{},
|
|
a_tensors_ref,
|
|
b_tensors_ref,
|
|
{});
|
|
ref_invoker.Run(ref_arg);
|
|
|
|
// Copy result from device and compare
|
|
Tensor<OutDataType> output_gpu(output_ref.mDesc);
|
|
output_dev.FromDevice(output_gpu.mData.data());
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
|
|
// Compare results
|
|
return ck::utils::check_err(output_gpu, output_ref);
|
|
}
|
|
|
|
} // namespace test
|
|
} // namespace ck
|