Mirror of https://github.com/ROCm/composable_kernel.git, synced 2026-05-04 13:41:24 +00:00
Unified conv3D API + support for all data types. (#133)
* Convolution ND
* Code unification across dimensions for generating tensor descriptors.
* Example
* Instances
* Move convnd f32 instance file to comply with repo structure.
* Conv 1D tensor layouts.
* Formatting and use ReferenceConv.
* Reference ConvFwd supporting 1D and 2D convolution.
* Debug printing of TensorLayout name.
* Conv fwd 1D instance f32.
* Refactor conv ND example to support various conv dimensions.
* Rename conv nd example directory to prevent conflicts.
* Refactor some common utilities into a single file, plus some tests.
* Refactor GetHostTensorDescriptor + UT.
* Add 1D test case.
* Test reference convolution 1D/2D.
* Remove some leftovers.
* Fix convolution example error for 1D.
* Refactor test error-checking utility function.
* Test Conv2D Fwd XDL.
* More UTs for the 1D case.
* Parameterize input & weight initializers.
* Rename example to prevent conflicts.
* Split convnd instance into separate files for 1D/2D.
* Address review comments.
* Fix data type for flops/gbytes calculations.
* Assign example number 11.
* 3D cases for convolution utility functions.
* 3D reference convolution.
* Add support for 3D convolution.
* Check for inputs bigger than 2GB.
* Formatting.
* Support for bf16/f16/f32/i8: conv instances + UT.
* Use check_err from test_util.hpp.
* Split convnd test into separate files for each dim.
* Fix data generation and use proper instances.
* Skip tensor initialization if not necessary.
* Fix CMake files.
* Remove redundant conv2d_fwd test.
* Lower problem size for conv3D UT.
* 3D case for convnd example.
* Remove leftovers after merge.
* Add Conv Specialization string to GetTypeString.
* Skip instance causing numerical errors.
* Small fixes.
* Remove redundant includes.
* Fix namespace name error.
* Script for automatic testing and logging of convolution fwd UTs.
* Comment out numactl cmd.
* Refine weights initialization and relax rtol for fp16.
* Fix weights initialization for int8.
* Add type_convert when storing output in ref conv 1D.
* Restore old conv2d_fwd_xdl operation.
* Silence conv debug print.
* Format and clean up.
* Fix merge.
* Fix namespace for check_err.

Co-authored-by: Adam Osewski <aosewski@amd.com>
Co-authored-by: Chao Liu <chao.liu2@amd.com>
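The reference 3D convolution added by this commit is not part of the hunks shown below. For orientation only, a minimal NDHWC forward-convolution sketch of what it computes (standalone; the repo's ReferenceConv has its own tensor, descriptor, and element-wise-operation API):

#include <cstddef>
#include <vector>

// Illustrative naive 3D conv fwd, NDHWC activations, KZYXC weights, NDHWK output.
// in:  [N, Di, Hi, Wi, C], wei: [K, Z, Y, X, C], out: [N, Do, Ho, Wo, K]
void naive_conv3d_fwd_ndhwc(const std::vector<float>& in,
                            const std::vector<float>& wei,
                            std::vector<float>& out,
                            int N, int C, int K,
                            int Di, int Hi, int Wi,
                            int Z, int Y, int X,
                            int Do, int Ho, int Wo,
                            int stride, int dilation, int pad)
{
    for(int n = 0; n < N; ++n)
        for(int dout = 0; dout < Do; ++dout)
            for(int ho = 0; ho < Ho; ++ho)
                for(int wo = 0; wo < Wo; ++wo)
                    for(int k = 0; k < K; ++k)
                    {
                        double acc = 0.0; // accumulate in wider type, as reference impls commonly do
                        for(int z = 0; z < Z; ++z)
                            for(int y = 0; y < Y; ++y)
                                for(int x = 0; x < X; ++x)
                                {
                                    const int di = dout * stride + z * dilation - pad;
                                    const int hi = ho * stride + y * dilation - pad;
                                    const int wi = wo * stride + x * dilation - pad;
                                    if(di < 0 || di >= Di || hi < 0 || hi >= Hi ||
                                       wi < 0 || wi >= Wi)
                                        continue; // zero padding
                                    for(int c = 0; c < C; ++c)
                                        acc += in[(((std::size_t(n) * Di + di) * Hi + hi) * Wi + wi) * C + c] *
                                               wei[(((std::size_t(k) * Z + z) * Y + y) * X + x) * C + c];
                                }
                        out[(((std::size_t(n) * Do + dout) * Ho + ho) * Wo + wo) * K + k] =
                            static_cast<float>(acc);
                    }
}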
@@ -186,6 +186,28 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim
        return HostTensorDescriptor(
            dims, std::vector<std::size_t>{C * dims[2] * dims[3], 1, dims[3] * C, C});
    }
    // 3D
    else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NCDHW>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::KCZYX>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::NKDHW>::value)
    {
        return HostTensorDescriptor(dims,
                                    std::vector<std::size_t>{C * dims[2] * dims[3] * dims[4],
                                                             dims[2] * dims[3] * dims[4],
                                                             dims[3] * dims[4],
                                                             dims[4],
                                                             1});
    }
    else if constexpr(std::is_same<TensorLayout, ck::tensor_layout::convolution::NDHWC>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::KZYXC>::value ||
                      std::is_same<TensorLayout, ck::tensor_layout::convolution::NDHWK>::value)
    {
        return HostTensorDescriptor(
            dims,
            std::vector<std::size_t>{
                C * dims[2] * dims[3] * dims[4], 1, C * dims[3] * dims[4], C * dims[4], C});
    }

    std::stringstream err_msg;
    err_msg << "Unsupported data layout provided: " << layout << "!";
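For orientation, the stride arithmetic this hunk encodes, written out standalone (a sketch, not repo code; it assumes dims is ordered {N, C, D, H, W} in both branches, matching the indexing above):

#include <cstddef>
#include <vector>

// Packed NCDHW: W is contiguous; each outer dim strides over everything inside it.
std::vector<std::size_t> PackedStridesNCDHW(const std::vector<std::size_t>& dims)
{
    const std::size_t C = dims[1], D = dims[2], H = dims[3], W = dims[4];
    return {C * D * H * W, D * H * W, H * W, W, 1};
}

// Packed NDHWC (channels-last): C is contiguous, so every spatial stride carries a factor of C.
std::vector<std::size_t> PackedStridesNDHWC(const std::vector<std::size_t>& dims)
{
    const std::size_t C = dims[1], D = dims[2], H = dims[3], W = dims[4];
    return {C * D * H * W, 1, C * H * W, C * W, C};
}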
@@ -1,6 +1,8 @@
#ifndef CONVOLUTION_FORWARD_SPECIALIZATION
#define CONVOLUTION_FORWARD_SPECIALIZATION

#include <string>

namespace ck {
namespace tensor_operation {
namespace device {
@@ -13,6 +15,18 @@ enum ConvolutionForwardSpecialization_t
    OddC,
};

inline std::string getConvFwdSpecializationStr(const ConvolutionForwardSpecialization_t& s)
{
    switch(s)
    {
    case Default: return "Default";
    case Filter1x1Pad0: return "Filter1x1Pad0";
    case Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
    case OddC: return "OddC";
    default: return "Unrecognized specialization!";
    }
}

} // namespace device
} // namespace tensor_operation
} // namespace ck
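This helper feeds the GetTypeString changes shown in the hunks below. A hedged usage sketch (the call site is hypothetical):

#include <string>

// Unscoped enum, but C++11 allows qualifying the enumerator through the enum type.
using ck::tensor_operation::device::ConvolutionForwardSpecialization_t;
using ck::tensor_operation::device::getConvFwdSpecializationStr;

std::string tag =
    getConvFwdSpecializationStr(ConvolutionForwardSpecialization_t::Filter1x1Pad0);
// tag == "Filter1x1Pad0"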
@@ -875,7 +875,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
-           << K0PerBlock
+           << K0PerBlock << ", "
+           << getConvFwdSpecializationStr(ConvForwardSpecialization)
            << ">";
        // clang-format on
@@ -466,7 +466,6 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                      << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
        }
#endif

        if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
                                        arg.b_grid_desc_k0_n_k1_,
                                        arg.c_grid_desc_m_n_,
@@ -708,7 +707,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
-           << K0PerBlock
+           << K0PerBlock << ", "
+           << getConvFwdSpecializationStr(ConvForwardSpecialization)
            << ">";
        // clang-format on
@@ -367,6 +367,155 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
        }
    }

    template <ck::index_t NDim, typename std::enable_if<NDim == 3, bool>::type = false>
    static auto GetInputTensorDescriptor(ck::index_t N,
                                         ck::index_t C,
                                         ck::index_t gemm_m,
                                         ck::index_t gemm_k,
                                         ck::index_t gemm_m_pad,
                                         const std::vector<ck::index_t>& input_spatial_lengths,
                                         const std::vector<ck::index_t>& filter_spatial_lengths,
                                         const std::vector<ck::index_t>& output_spatial_lengths,
                                         const std::vector<ck::index_t>& conv_filter_strides,
                                         const std::vector<ck::index_t>& conv_filter_dilations,
                                         const std::vector<ck::index_t>& input_left_pads,
                                         const std::vector<ck::index_t>& input_right_pads)
    {
        const ck::index_t gemm_k0 = gemm_k / GemmK1Number;

        const index_t Di = input_spatial_lengths[0];
        const index_t Hi = input_spatial_lengths[1];
        const index_t Wi = input_spatial_lengths[2];

        const index_t Do = output_spatial_lengths[0];
        const index_t Ho = output_spatial_lengths[1];
        const index_t Wo = output_spatial_lengths[2];

        const index_t ConvStrideD = conv_filter_strides[0];
        const index_t ConvStrideH = conv_filter_strides[1];
        const index_t ConvStrideW = conv_filter_strides[2];

        if constexpr(ConvForwardSpecialization ==
                     ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
        {
            const auto in_gemmmraw_gemmk_grid_desc =
                make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_k));

            // in_gemmk0_gemmm_gemmk1_grid_desc
            return transform_tensor_descriptor(
                in_gemmmraw_gemmk_grid_desc,
                make_tuple(make_unmerge_transform(make_tuple(gemm_k0, GemmK1Number)),
                           make_right_pad_transform(gemm_m, gemm_m_pad)),
                make_tuple(Sequence<1>{}, Sequence<0>{}),
                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
        }
        else if constexpr(ConvForwardSpecialization ==
                          ConvolutionForwardSpecialization_t::Filter1x1Pad0)
        {
            const auto in_n_di_hi_wi_c_grid_desc =
                make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));

            const auto in_n_do_ho_wo_c_grid_desc = transform_tensor_descriptor(
                in_n_di_hi_wi_c_grid_desc,
                make_tuple(make_pass_through_transform(N),
                           make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)),
                           make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
                           make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
                           make_pass_through_transform(C)),
                make_tuple(
                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
                make_tuple(
                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

            const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor(
                in_n_do_ho_wo_c_grid_desc,
                make_tuple(make_unmerge_transform(make_tuple(gemm_k0, GemmK1Number)),
                           make_merge_transform(make_tuple(N, Do, Ho, Wo))),
                make_tuple(Sequence<4>{}, Sequence<0, 1, 2, 3>{}),
                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));

            // in_gemmk0_gemmm_gemmk1_grid_desc
            return transform_tensor_descriptor(
                in_gemmk0_gemmmraw_gemmk1_grid_desc,
                make_tuple(make_pass_through_transform(gemm_k0),
                           make_right_pad_transform(gemm_m, gemm_m_pad),
                           make_pass_through_transform(GemmK1Number)),
                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
        }
        else
        {
            const index_t Z = filter_spatial_lengths[0];
            const index_t Y = filter_spatial_lengths[1];
            const index_t X = filter_spatial_lengths[2];

            const index_t ConvDilationD = conv_filter_dilations[0];
            const index_t ConvDilationH = conv_filter_dilations[1];
            const index_t ConvDilationW = conv_filter_dilations[2];

            const index_t InLeftPadD = input_left_pads[0];
            const index_t InLeftPadH = input_left_pads[1];
            const index_t InLeftPadW = input_left_pads[2];

            const index_t InRightPadD = input_right_pads[0];
            const index_t InRightPadH = input_right_pads[1];
            const index_t InRightPadW = input_right_pads[2];

            const auto in_n_di_hi_wi_c_grid_desc =
                make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));

            const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
                in_n_di_hi_wi_c_grid_desc,
                make_tuple(make_pass_through_transform(N),
                           make_pad_transform(Di, InLeftPadD, InRightPadD),
                           make_pad_transform(Hi, InLeftPadH, InRightPadH),
                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
                           make_pass_through_transform(C)),
                make_tuple(
                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
                make_tuple(
                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

            const auto in_n_z_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
                in_n_hip_wip_c_grid_desc,
                make_tuple(
                    make_pass_through_transform(N),
                    make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)),
                    make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
                    make_pass_through_transform(C)),
                make_tuple(
                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
                make_tuple(Sequence<0>{},
                           Sequence<1, 2>{},
                           Sequence<3, 4>{},
                           Sequence<5, 6>{},
                           Sequence<7>{}));

            const auto in_gemmk_gemmmraw_grid_desc = transform_tensor_descriptor(
                in_n_z_do_y_ho_x_wo_c_grid_desc,
                make_tuple(make_merge_transform(make_tuple(Z, Y, X, C)),
                           make_merge_transform(make_tuple(N, Do, Ho, Wo))),
                make_tuple(Sequence<1, 3, 5, 7>{}, Sequence<0, 2, 4, 6>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));

            const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor(
                in_gemmk_gemmmraw_grid_desc,
                make_tuple(make_unmerge_transform(make_tuple(gemm_k0, GemmK1Number)),
                           make_pass_through_transform(gemm_m)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));

            // in_gemmk0_gemmm_gemmk1_grid_desc
            return transform_tensor_descriptor(
                in_gemmk0_gemmmraw_gemmk1_grid_desc,
                make_tuple(make_pass_through_transform(gemm_k0),
                           make_right_pad_transform(gemm_m, gemm_m_pad),
                           make_pass_through_transform(GemmK1Number)),
                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
        }
    }

    static index_t GetGemmMRaw(ck::index_t N,
                               const std::vector<ck::index_t>& output_spatial_lengths)
    {
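A hedged summary sketch (not repo code) of the implicit-GEMM view these transforms realize for 3D forward convolution: gemm_m indexes output pixels, gemm_n indexes output channels, and gemm_k reduces over filter taps times input channels.

#include <functional>
#include <numeric>
#include <vector>

struct GemmDims
{
    int m, n, k;
};

GemmDims ConvFwdAsGemm(int N, int K, int C,
                       const std::vector<int>& output_spatial_lengths, // {Do, Ho, Wo}
                       const std::vector<int>& filter_spatial_lengths) // {Z, Y, X}
{
    const int out_pixels = std::accumulate(output_spatial_lengths.begin(),
                                           output_spatial_lengths.end(),
                                           1,
                                           std::multiplies<int>{});
    const int filter_taps = std::accumulate(filter_spatial_lengths.begin(),
                                            filter_spatial_lengths.end(),
                                            1,
                                            std::multiplies<int>{});
    return {N * out_pixels,   // gemm_m: merge_transform(N, Do, Ho, Wo)
            K,                // gemm_n: output channels
            filter_taps * C}; // gemm_k: merge_transform(Z, Y, X, C)
}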
@@ -445,6 +594,13 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
            1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1});
    }

    template <ck::index_t NDim, typename std::enable_if<NDim == 3, bool>::type = false>
    static auto GetABCGridDesc()
    {
        return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
            1, 1, 1, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1});
    }

    using ABCGridDescs = decltype(GetABCGridDesc<NumDimSpatial>());

    using AGridDesc_K0_M_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I0])>;
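GetABCGridDesc is a compile-time dispatch trick: the all-ones arguments are dummies that exist only so decltype can recover the descriptor types for the chosen dimensionality. Reduced to its essentials (illustrative, names hypothetical):

#include <type_traits>
#include <utility>

// Two overloads selected at compile time on the spatial dimension via enable_if.
template <int NDim, typename std::enable_if<NDim == 2, bool>::type = false>
auto MakeDescs() { return std::make_pair('2', 'D'); } // stand-in for the 2D descriptors

template <int NDim, typename std::enable_if<NDim == 3, bool>::type = false>
auto MakeDescs() { return std::make_pair('3', 'D'); } // stand-in for the 3D descriptors

// The call appears only inside decltype; no object is ever constructed at runtime.
using Descs = decltype(MakeDescs<3>());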
@@ -593,6 +749,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K

    float Run(const Argument& arg, int nrepeat = 1)
    {
#if 0
        {
            std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
                      << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
@@ -605,7 +762,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
            std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
                      << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
        }

#endif
        if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
                                        arg.b_grid_desc_k0_n_k1_,
                                        arg.c_grid_desc_m_n_,
@@ -704,6 +861,22 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K

    static bool IsSupportedArgument(const Argument& arg)
    {
        // Input tensors can't be bigger than 2GB each.
        constexpr std::size_t GB2 = 2 * 1e9;

        if(arg.a_grid_desc_k0_m_k1_.GetElementSpaceSize() > GB2)
        {
            return false;
        }
        if(arg.b_grid_desc_k0_n_k1_.GetElementSpaceSize() > GB2)
        {
            return false;
        }
        if(arg.c_grid_desc_m_n_.GetElementSpaceSize() > GB2)
        {
            return false;
        }

        if constexpr(ConvForwardSpecialization ==
                     ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
        {
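The guard rejects each operand separately. A standalone restatement (the helper name is hypothetical, and the offset-overflow rationale is our assumption, not stated in the diff):

#include <cstddef>

// Reject any operand whose element span exceeds 2 * 10^9, presumably to keep
// per-tensor offsets safely inside 32-bit addressing on the device.
inline bool WithinGridSizeLimit(std::size_t element_space_size)
{
    constexpr std::size_t GB2 = 2 * 1e9; // same constant as in the diff
    return element_space_size <= GB2;
}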
@@ -851,7 +1024,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
-           << K0PerBlock
+           << K0PerBlock << ", "
+           << getConvFwdSpecializationStr(ConvForwardSpecialization)
            << ">";
        // clang-format on
@@ -85,6 +85,7 @@ struct NKHW : public BaseTensorLayout
    static constexpr const char* name = "NKHW";
};

// 3D Conv
struct NDHWC : public BaseTensorLayout
{
    static constexpr const char* name = "NDHWC";
@@ -100,6 +101,21 @@ struct NDHWK : public BaseTensorLayout
    static constexpr const char* name = "NDHWK";
};

struct NCDHW : public BaseTensorLayout
{
    static constexpr const char* name = "NCDHW";
};

struct KCZYX : public BaseTensorLayout
{
    static constexpr const char* name = "KCZYX";
};

struct NKDHW : public BaseTensorLayout
{
    static constexpr const char* name = "NKDHW";
};

} // namespace convolution

template <
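Each new tag carries only a constexpr name. A hypothetical usage sketch of how that name is consumed, in the spirit of the commit's "Debug printing of TensorLayout name":

#include <iostream>

// Compile-time layout tag in, human-readable name out.
template <typename TensorLayout>
void PrintLayoutName()
{
    std::cout << TensorLayout::name << '\n';
}

// PrintLayoutName<ck::tensor_layout::convolution::NCDHW>(); // prints "NCDHW"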