mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-07-02 21:27:45 +00:00
Revert "tmp save, trying processing full kperblock but it doesnt work"
This reverts commit 49a79b7ab4.
This commit is contained in:
@@ -11,8 +11,8 @@ struct StreamConfig
|
||||
hipStream_t stream_id_ = nullptr;
|
||||
bool time_kernel_ = false;
|
||||
int log_level_ = 0;
|
||||
int cold_niters_ = 0;//5;
|
||||
int nrepeat_ = 1;//50;
|
||||
int cold_niters_ = 5;
|
||||
int nrepeat_ = 50;
|
||||
|
||||
bool flush_cache = false;
|
||||
int rotating_count = 1;
|
||||
|
||||
@@ -1264,7 +1264,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
|
||||
{
|
||||
namespace ctc = tensor_layout::convolution;
|
||||
|
||||
const index_t W = arg.a_g_n_c_wis_lengths_[I4]; // assume 2d for now
|
||||
const index_t G = arg.b_g_k_c_xs_lengths_[I0];
|
||||
const index_t K = arg.b_g_k_c_xs_lengths_[I1];
|
||||
const index_t C = arg.b_g_k_c_xs_lengths_[I2];
|
||||
@@ -1352,7 +1351,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
|
||||
return false;
|
||||
}
|
||||
}
|
||||
//printf("checking a vector \n");
|
||||
|
||||
// check vector access of A
|
||||
// FIXME: layout
|
||||
@@ -1364,7 +1362,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
|
||||
is_same_v<ALayout, ctc::NGCHW> || is_same_v<ALayout, ctc::NGCDHW>)
|
||||
{
|
||||
// Check access per C
|
||||
if(!(ABlockTransferSrcVectorDim == 2 && W % ABlockTransferSrcScalarPerVector == 0)) // changed from C % to H%
|
||||
if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0))
|
||||
{
|
||||
// If not possible, check access per G
|
||||
if(!(ABlockTransferSrcVectorDim == 1 && (C == 1 || NumGroupsToMerge == 1) &&
|
||||
@@ -1373,17 +1371,15 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
|
||||
is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>()) &&
|
||||
G % ABlockTransferSrcScalarPerVector == 0))
|
||||
{
|
||||
printf("checking a vector failed \n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("checking a vector wtf \n");
|
||||
return false;
|
||||
}
|
||||
// printf("checking b vector \n");
|
||||
|
||||
// check vector access of B
|
||||
// FIXME: layout
|
||||
if constexpr(is_same_v<BLayout, ctc::G_K_X_C> || is_same_v<BLayout, ctc::G_K_YX_C> ||
|
||||
@@ -1452,30 +1448,30 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
|
||||
if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
|
||||
is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
|
||||
{
|
||||
// if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
|
||||
// {
|
||||
// return false;
|
||||
// }
|
||||
if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// if((G * K) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
|
||||
// {
|
||||
// return false;
|
||||
// }
|
||||
if((G * K) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// const index_t input_spatial_acum = ck::accumulate_n<index_t>(
|
||||
// arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
|
||||
// const index_t output_spatial_acum = ck::accumulate_n<index_t>(
|
||||
// arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
|
||||
const index_t input_spatial_acum = ck::accumulate_n<index_t>(
|
||||
arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
|
||||
const index_t output_spatial_acum = ck::accumulate_n<index_t>(
|
||||
arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
|
||||
|
||||
// if(input_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
|
||||
// {
|
||||
// return false;
|
||||
// }
|
||||
if(input_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// if(output_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
|
||||
// {
|
||||
// return false;
|
||||
// }
|
||||
if(output_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// if(!arg.p_workspace_)
|
||||
// {
|
||||
@@ -1501,7 +1497,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// printf("checking e vector \n");
|
||||
|
||||
// check vector access of E
|
||||
if constexpr(is_same_v<ELayout, ctc::G_NW_K> || is_same_v<ELayout, ctc::G_NHW_K> ||
|
||||
is_same_v<ELayout, ctc::G_NDHW_K> || is_same_v<ELayout, ctc::GNWK> ||
|
||||
@@ -1536,7 +1532,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
|
||||
}
|
||||
else
|
||||
{
|
||||
//printf("gonig into gemm validation\n");
|
||||
return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
|
||||
arg.b_grid_desc_n_k_,
|
||||
arg.ds_grid_desc_m_n_,
|
||||
|
||||
@@ -162,19 +162,19 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
make_tuple(NSlice, PaddedCSlice, HSlice, WSlice),
|
||||
make_tuple(PaddedCSlice * HSlice * WSlice, HSlice * WSlice, WSlice, I1));
|
||||
|
||||
// constexpr auto nchw_slice_sliced_desc = transform_tensor_descriptor(
|
||||
// nchw_slice_desc,
|
||||
// make_tuple(make_pass_through_transform(NSlice),
|
||||
// make_slice_transform(PaddedCSlice, I0, CSlice),
|
||||
// make_pass_through_transform(HSlice),
|
||||
// make_pass_through_transform(WSlice)),
|
||||
// make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
// make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
constexpr auto nchw_padded_slice_desc = transform_tensor_descriptor(
|
||||
constexpr auto nchw_slice_sliced_desc = transform_tensor_descriptor(
|
||||
nchw_slice_desc,
|
||||
make_tuple(make_pass_through_transform(NSlice),
|
||||
make_pass_through_transform(PaddedCSlice),
|
||||
make_slice_transform(PaddedCSlice, I0, CSlice),
|
||||
make_pass_through_transform(HSlice),
|
||||
make_pass_through_transform(WSlice)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
constexpr auto nchw_padded_slice_desc = transform_tensor_descriptor(
|
||||
nchw_slice_sliced_desc,
|
||||
make_tuple(make_pass_through_transform(NSlice),
|
||||
make_pass_through_transform(CSlice),
|
||||
make_pass_through_transform(HSlice),
|
||||
make_pad_transform(WSlice, InLeftPadW, InRightPadW)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
@@ -183,7 +183,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
constexpr auto n_c_y_ho_x_wo_desc = transform_tensor_descriptor(
|
||||
nchw_padded_slice_desc,
|
||||
make_tuple(make_pass_through_transform(NSlice),
|
||||
make_pass_through_transform(PaddedCSlice),
|
||||
make_pass_through_transform(CSlice),
|
||||
make_embed_transform(make_tuple(Y, HoutSlice),
|
||||
make_tuple(ConvDilationH, ConvStrideH)),
|
||||
make_embed_transform(make_tuple(X, WoutSlice),
|
||||
@@ -196,26 +196,19 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
constexpr auto mk_desc = transform_tensor_descriptor(
|
||||
n_c_y_ho_x_wo_desc,
|
||||
make_tuple(make_merge_transform(make_tuple(NSlice, HoutSlice, WoutSlice)),
|
||||
make_merge_transform(make_tuple(PaddedCSlice, Y, X))),
|
||||
make_merge_transform(make_tuple(CSlice, Y, X))),
|
||||
make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
constexpr auto mk_desc_sliced = transform_tensor_descriptor(
|
||||
constexpr auto FinalMKPad = KPerBlock - TrueKPerBlock;
|
||||
|
||||
constexpr auto mk_pad_desc = transform_tensor_descriptor(
|
||||
mk_desc,
|
||||
make_tuple(make_pass_through_transform(NSlice * HoutSlice * WoutSlice),
|
||||
make_slice_transform(PaddedCSlice * Y * X, I0, KPerBlock)),
|
||||
make_right_pad_transform(CSlice * Y * X, FinalMKPad)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
// constexpr auto FinalMKPad = KPerBlock - TrueKPerBlock;
|
||||
|
||||
// constexpr auto mk_pad_desc = transform_tensor_descriptor(
|
||||
// mk_desc,
|
||||
// make_tuple(make_pass_through_transform(NSlice * HoutSlice * WoutSlice),
|
||||
// make_right_pad_transform(PaddedCSlice * Y * X, FinalMKPad)),
|
||||
// make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
// make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
// if(threadIdx.x == 0 && blockIdx.x == 0) {
|
||||
// printf("nchw pad slice%d\n", nchw_padded_slice_desc.GetElementSize());
|
||||
// printf("nhowocyx %d\n", nhowocyx_desc.GetElementSize());
|
||||
@@ -225,7 +218,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
|
||||
constexpr auto AK0 = KPerBlock / AK1;
|
||||
|
||||
return transform_tensor_descriptor(mk_desc_sliced,
|
||||
return transform_tensor_descriptor(mk_pad_desc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
|
||||
make_pass_through_transform(MPerBlock)),
|
||||
make_tuple(Sequence<1>{}, Sequence<0>{}),
|
||||
@@ -240,16 +233,6 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
make_tuple(Number<MPerBlock + ABlockLdsExtraM>{} * AK1, AK1, I1));
|
||||
}
|
||||
|
||||
// __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1_padded()
|
||||
// {
|
||||
// // B matrix in LDS memory, dst of blockwise copy
|
||||
// //constexpr BK0 = PaddedCSlice * X * Y;
|
||||
|
||||
// return make_naive_tensor_descriptor(
|
||||
// make_tuple(BK0PerBlock, Number<NPerBlock>{}, BK1),
|
||||
// make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1, BK1, I1));
|
||||
// }
|
||||
|
||||
__host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
|
||||
{
|
||||
// B matrix in LDS memory, dst of blockwise copy
|
||||
@@ -533,7 +516,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
|
||||
__host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
|
||||
{
|
||||
const index_t num_loop = K / KPerBlock;
|
||||
const index_t num_loop = K / TrueKPerBlock;
|
||||
|
||||
return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
|
||||
}
|
||||
@@ -637,27 +620,6 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
|
||||
__device__ __host__ static constexpr auto GetMPerBlock() { return MPerBlock; }
|
||||
|
||||
template <typename AGridDesc_NCHW>
|
||||
__host__ __device__ static auto GetAGridDescriptor_NCHW_BlockSlice(const AGridDesc_NCHW& a_grid_desc_nchw, const index_t& m_block_idx)
|
||||
{
|
||||
|
||||
|
||||
const index_t Hout = PadH ? a_grid_desc_nchw.GetLength(I2) - Y + 1 : a_grid_desc_nchw.GetLength(I2);
|
||||
|
||||
//const index_t m_block_data_idx_on_grid_nchw_n = __builtin_amdgcn_readfirstlane((m_block_idx * HoutSlice) / Hout);
|
||||
//const index_t m_block_data_idx_on_grid_nchw_h = __builtin_amdgcn_readfirstlane((m_block_idx * HoutSlice) % Hout);
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
a_grid_desc_nchw,
|
||||
make_tuple(make_pass_through_transform(a_grid_desc_nchw.GetLength(I0)),
|
||||
make_pass_through_transform(a_grid_desc_nchw.GetLength(I1)),
|
||||
make_slice_transform(a_grid_desc_nchw.GetLength(I2), (m_block_idx * HoutSlice) % Hout, HSlice),
|
||||
make_pass_through_transform(a_grid_desc_nchw.GetLength(I3))),
|
||||
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
}
|
||||
|
||||
template <bool HasMainKBlockLoop,
|
||||
typename AGridDesc_AK0_M_AK1,
|
||||
typename AGridDesc_NCHW,
|
||||
@@ -718,13 +680,11 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
const index_t Hout = PadH ? a_grid_desc_nchw.GetLength(I2) - Y + 1 : a_grid_desc_nchw.GetLength(I2);
|
||||
|
||||
const index_t m_block_data_idx_on_grid_nchw_n = __builtin_amdgcn_readfirstlane((block_work_idx[I0] * HoutSlice) / Hout);
|
||||
//const index_t m_block_data_idx_on_grid_nchw_h = __builtin_amdgcn_readfirstlane((block_work_idx[I0] * HoutSlice) % Hout);
|
||||
const index_t m_block_data_idx_on_grid_nchw_h = __builtin_amdgcn_readfirstlane((block_work_idx[I0] * HoutSlice) % Hout);
|
||||
|
||||
const index_t n_block_data_idx_on_grid =
|
||||
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
|
||||
|
||||
const auto a_grid_desc_nchw_block_slice = GetAGridDescriptor_NCHW_BlockSlice(a_grid_desc_nchw, block_work_idx[I0]);
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
// printf("BlockIdx.x %d, mblock idx %d n %d h%d nblock idx %d Hout %d", blockIdx.x, block_work_idx[I0],
|
||||
// m_block_data_idx_on_grid_nchw_n, m_block_data_idx_on_grid_nchw_h, n_block_data_idx_on_grid, Hout);
|
||||
@@ -769,7 +729,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
Sequence<0, 1, 2, 3>,//ABlockTransferThreadClusterArrangeOrder,
|
||||
ADataType,
|
||||
AComputeDataType,
|
||||
decltype(a_grid_desc_nchw_block_slice),
|
||||
decltype(a_grid_desc_nchw),
|
||||
decltype(a_block1_desc_cslice_hslice_wslice),
|
||||
Sequence<0, 1, 2, 3>,//ABlockTransferSrcAccessOrder,
|
||||
Sequence<0, 1, 2, 3>,//Sequence<1, 0, 2>,
|
||||
@@ -782,8 +742,8 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
AThreadTransferSrcResetCoordinateAfterRun,
|
||||
true,
|
||||
NumGemmKPrefetchStage>(
|
||||
a_grid_desc_nchw_block_slice,
|
||||
make_multi_index(m_block_data_idx_on_grid_nchw_n, 0, 0, 0), // H slice is done inside of grid desc
|
||||
a_grid_desc_nchw,
|
||||
make_multi_index(m_block_data_idx_on_grid_nchw_n, 0, m_block_data_idx_on_grid_nchw_h, 0),
|
||||
a_element_op,
|
||||
a_block1_desc_cslice_hslice_wslice,
|
||||
make_multi_index(0, 0, 0, 0),
|
||||
@@ -810,7 +770,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
ABlockTransferDstScalarPerVector_AK1,//ABlockTransferDstScalarPerVector_AK1,
|
||||
1,
|
||||
1,
|
||||
false,// was triue nad working before changes//AThreadTransferSrcResetCoordinateAfterRun,
|
||||
true,//AThreadTransferSrcResetCoordinateAfterRun,
|
||||
true,
|
||||
NumGemmKPrefetchStage>(
|
||||
a_block1_desc_nhowo_cyx,
|
||||
@@ -944,40 +904,9 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
b_block_desc_bk0_n_bk1.GetElementSpaceSize());
|
||||
|
||||
//constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);
|
||||
//constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 0, 0); // make multiple steps
|
||||
// template <index_t N>
|
||||
// using MultiIndex = StaticallyIndexedArray<index_t, N>;
|
||||
[[maybe_unused]] StaticallyIndexedArray<index_t, 4> a_block1_slice_copy_steps[9] = {
|
||||
make_multi_index(0, CSlice, 1, 0),
|
||||
make_multi_index(0, CSlice, 2, 0),
|
||||
make_multi_index(0, CSlice, 2, 0),
|
||||
make_multi_index(0, CSlice, 1, 0),
|
||||
make_multi_index(0, CSlice, 2, 0),
|
||||
make_multi_index(0, CSlice, 2, 0),
|
||||
make_multi_index(0, CSlice, 1, 0),
|
||||
make_multi_index(0, CSlice, 2, 0),
|
||||
make_multi_index(0, CSlice, 2, 0)
|
||||
};
|
||||
|
||||
[[maybe_unused]] StaticallyIndexedArray<index_t, 3> a_block2_slice_copy_steps[9] = {
|
||||
make_multi_index(0, 0, 2), // 2
|
||||
make_multi_index(0, 0, -1), // -1
|
||||
make_multi_index(0, 0, -1), // -1
|
||||
make_multi_index(0, 0, 2), // 2
|
||||
make_multi_index(0, 0, -1), // -1
|
||||
make_multi_index(0, 0, -1), // -1
|
||||
make_multi_index(0, 0, 2), // 2
|
||||
make_multi_index(0, 0, -1), // -1
|
||||
make_multi_index(0, 0, -1) // -1
|
||||
};
|
||||
// constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 1, 0); // make multiple steps
|
||||
// constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 2, 0); // make multiple steps
|
||||
// constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 2, 0); // make multiple steps
|
||||
//constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 2); // as Y % mod(KPerBlock - CSlice*Y*X)
|
||||
//constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 1); //
|
||||
//constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 0);
|
||||
constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 0, 0);
|
||||
//constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
|
||||
constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
|
||||
constexpr auto b_block_slice_copy_step = make_multi_index(TrueKPerBlock / BK1, 0, TrueKPerBlock % BK1);
|
||||
|
||||
// gridwise GEMM pipeline
|
||||
const auto gridwise_gemm_pipeline =
|
||||
@@ -985,14 +914,15 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
|
||||
const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
|
||||
(a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
|
||||
KPerBlock); // TrueKPerBlock ?
|
||||
TrueKPerBlock) + ((a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) %
|
||||
TrueKPerBlock != 0); // TrueKPerBlock ?
|
||||
|
||||
if(threadIdx.x == 0 && blockIdx.x == 0) {
|
||||
printf("hasmainkblockloop: %d\n", HasMainKBlockLoop);
|
||||
printf("Running %d k block iters\n", num_k_block_main_loop);
|
||||
}
|
||||
// if(threadIdx.x == 0) {
|
||||
// printf("hasmainkblockloop: %d\n", HasMainKBlockLoop);
|
||||
// printf("Running %d k block iters", num_k_block_main_loop);
|
||||
// }
|
||||
|
||||
gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_nchw_block_slice, // global
|
||||
gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_nchw, // global
|
||||
a_block1_desc_cslice_hslice_wslice, // lds1 write
|
||||
a_block1_desc_nhowo_cyx, // lds1 read
|
||||
a_block2_desc_ak0_m_ak1, // lds2 write/read
|
||||
@@ -1001,8 +931,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
a_grid_buf,
|
||||
a_block_buf1,
|
||||
a_block_buf2,
|
||||
a_block1_slice_copy_steps,
|
||||
a_block2_slice_copy_steps,
|
||||
a_block_slice_copy_step,
|
||||
b_grid_desc_bk0_n_bk1,
|
||||
b_block_desc_bk0_n_bk1,
|
||||
b_blockwise_copy,
|
||||
@@ -1099,7 +1028,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
|
||||
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
|
||||
CShuffleDataType,
|
||||
decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
|
||||
decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2), // MN
|
||||
decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
Sequence<CShuffleMXdlPerWavePerShuffle,
|
||||
CShuffleNXdlPerWavePerShuffle,
|
||||
|
||||
@@ -72,7 +72,7 @@ struct GridwiseGemmPipeline_v1<1, true, true>
|
||||
|
||||
// if(threadIdx.x == 0){
|
||||
// for(int m=0; m<128; ++m) {
|
||||
// for(int k0=0; k0<4; ++k0) {
|
||||
// for(int k0=0; k0<8; ++k0) {
|
||||
// for(int k1=0;k1<8;++k1) {
|
||||
// printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block_buf[k0*8*128 + m*8 + k1]));
|
||||
// }
|
||||
@@ -102,16 +102,6 @@ struct GridwiseGemmPipeline_v1<1, true, true>
|
||||
|
||||
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
|
||||
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
|
||||
block_sync_lds();
|
||||
// if(threadIdx.x == 0){
|
||||
// for(int m=0; m<128; ++m) {
|
||||
// for(int k0=0; k0<4; ++k0) {
|
||||
// for(int k1=0;k1<8;++k1) {
|
||||
// printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block_buf[k0*8*128 + m*8 + k1]));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
++i;
|
||||
} while(i < (num_loop - 1));
|
||||
|
||||
@@ -36,8 +36,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
|
||||
typename AGridBuffer,
|
||||
typename ABlock1Buffer,
|
||||
typename ABlock2Buffer,
|
||||
typename ABlock1TransferSteps,
|
||||
typename ABlock2TransferSteps,
|
||||
typename ABlockTransferStep,
|
||||
typename BGridDesc,
|
||||
typename BBlockDesc,
|
||||
typename BBlockTransfer,
|
||||
@@ -55,8 +54,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
|
||||
[[maybe_unused]] const AGridBuffer& a_grid_buf,
|
||||
[[maybe_unused]] ABlock1Buffer& a_block1_buf,
|
||||
[[maybe_unused]] ABlock2Buffer& a_block2_buf,
|
||||
[[maybe_unused]] const ABlock1TransferSteps& a_block1_copy_steps,
|
||||
[[maybe_unused]] const ABlock2TransferSteps& a_block2_copy_steps,
|
||||
[[maybe_unused]] const ABlockTransferStep& a_block_copy_step,
|
||||
[[maybe_unused]] const BGridDesc& b_grid_desc,
|
||||
[[maybe_unused]] const BBlockDesc& b_block_desc,
|
||||
[[maybe_unused]] BBlockTransfer& b_blockwise_copy,
|
||||
@@ -71,7 +69,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
|
||||
a_blockwise_global_to_lds1_copy.RunRead(a_grid_desc, a_grid_buf);
|
||||
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
|
||||
|
||||
a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block1_copy_steps[0]);
|
||||
a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
|
||||
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
|
||||
//block_sync_lds();
|
||||
a_blockwise_global_to_lds1_copy.RunWrite(a_block1_desc_nchw_slice, a_block1_buf);
|
||||
@@ -85,11 +83,9 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
|
||||
a_blockwise_lds1_to_lds2_copy.RunRead(a_block1_desc_nhowo_cyx, a_block1_buf);//a_block2_buf); ?? jaki bufor
|
||||
a_blockwise_lds1_to_lds2_copy.RunWrite(a_block2_desc_ak0_m_k1, a_block2_buf);
|
||||
//block_sync_lds();
|
||||
a_blockwise_lds1_to_lds2_copy.MoveSrcSliceWindow(a_block1_desc_nhowo_cyx, a_block2_copy_steps[0]);
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
// printf("Next iter\n");
|
||||
// for(int c=0; c<5; ++c) {
|
||||
// for(int c=0; c<4; ++c) {
|
||||
// for(int h=0; h<2;++h) {
|
||||
// for(int w=0; w<64;++w) {
|
||||
// printf("Ag[%d][%d][%d]=%f\n", c, h, w, static_cast<float>(a_grid_buf[c * 64 * 2 + h * 64 + w]));
|
||||
@@ -97,9 +93,9 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
|
||||
// }
|
||||
// }
|
||||
|
||||
// // // A[3][2][33]=0.000000
|
||||
// // A[3][2][33]=0.000000
|
||||
|
||||
// for(int c=0; c<4; ++c) {
|
||||
// for(int c=0; c<8; ++c) {
|
||||
// for(int h=0; h<4;++h) {
|
||||
// for(int w=0; w<64;++w) {
|
||||
// printf("A[%d][%d][%d]=%f\n", c, h, w, static_cast<float>(a_block1_buf[c * 64 * 4 + h * 64 + w]));
|
||||
@@ -108,14 +104,14 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
|
||||
// }
|
||||
|
||||
// for(int m=0; m<128; ++m) {
|
||||
// for(int k0=0; k0<4; ++k0) {
|
||||
// for(int k0=0; k0<8; ++k0) {
|
||||
// for(int k1=0;k1<8;++k1) {
|
||||
// printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block2_buf[k0*8*128 + m*8 + k1]));
|
||||
|
||||
// // int k = k0*8+k1;
|
||||
// // int x = k%3;
|
||||
// // int y = (k/3)%3;
|
||||
// // int c = k/(9);
|
||||
// int k = k0*8+k1;
|
||||
// int x = k%3;
|
||||
// int y = (k/3)%3;
|
||||
// int c = k/(9);
|
||||
|
||||
// // A[m][k] = A[c, y+m/64,x+m%64] z oryginalnego obrazka
|
||||
|
||||
@@ -146,7 +142,6 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
|
||||
do
|
||||
{ // ask bartek how to structure this pipeline
|
||||
a_blockwise_global_to_lds1_copy.RunRead(a_grid_desc, a_grid_buf); // A Global -> VGPR
|
||||
block_sync_lds();
|
||||
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); // B Global -> VGPR
|
||||
//
|
||||
|
||||
@@ -156,53 +151,13 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
|
||||
block_sync_lds();
|
||||
a_blockwise_lds1_to_lds2_copy.RunRead(a_block1_desc_nhowo_cyx, a_block1_buf); // A LDS1 -> LDS2
|
||||
|
||||
a_blockwise_lds1_to_lds2_copy.MoveSrcSliceWindow(a_block1_desc_nhowo_cyx, a_block2_copy_steps[i+1]);
|
||||
a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block1_copy_steps[i+1]);
|
||||
a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
|
||||
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
|
||||
//block_sync_lds();
|
||||
|
||||
a_blockwise_lds1_to_lds2_copy.RunWrite(a_block2_desc_ak0_m_k1, a_block2_buf);
|
||||
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
|
||||
// block_sync_lds(); // delete later
|
||||
|
||||
// if(threadIdx.x == 0) {
|
||||
// printf("Next iter\n");
|
||||
|
||||
// // // A[3][2][33]=0.000000
|
||||
|
||||
// for(int c=0; c<4; ++c) {
|
||||
// for(int h=0; h<4;++h) {
|
||||
// for(int w=0; w<64;++w) {
|
||||
// printf("A[%d][%d][%d]=%f\n", c, h, w, static_cast<float>(a_block1_buf[c * 64 * 4 + h * 64 + w]));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// for(int m=0; m<128; ++m) {
|
||||
// for(int k0=0; k0<4; ++k0) {
|
||||
// for(int k1=0;k1<8;++k1) {
|
||||
// printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block2_buf[k0*8*128 + m*8 + k1]));
|
||||
|
||||
// // int k = k0*8+k1;
|
||||
// // int x = k%3;
|
||||
// // int y = (k/3)%3;
|
||||
// // int c = k/(9);
|
||||
|
||||
// // A[m][k] = A[c, y+m/64,x+m%64] z oryginalnego obrazka
|
||||
|
||||
// // if(x+m%64 > 0 && x+m%64 < 127) {
|
||||
// // auto lds1 = static_cast<float>(a_block1_buf[c * 64 * 4 + (m/64 + y) * 64 + m%64 + x - 1]);
|
||||
// // auto lds2 = static_cast<float>(a_block2_buf[k0*8*128 + m*8 + k1]);
|
||||
// // printf("lds2[%d][%d]:%f lds1[%d][%d][%d]:%f %s\n", m, k0*8+k1, lds2, c, y + m/64, m%64 + x - 1, lds1, (lds1 > lds2)? "diff" : "");
|
||||
|
||||
// // }
|
||||
|
||||
// // if(lds1 > lds2) {
|
||||
// // printf("diff lds1[%d][%d][%d]:%f lds2[%d][%d]:%f\n", c, y + m/64, m%64 + x, lds1, m, k0*8+k1 - 1, lds2);
|
||||
// // }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//block_sync_lds();
|
||||
|
||||
++i;
|
||||
} while(i < (num_loop - 1));
|
||||
|
||||
@@ -1477,8 +1477,6 @@ struct TransformConvFwdToGemm
|
||||
}
|
||||
}
|
||||
|
||||
// NGCHW x KCYX
|
||||
// NHoWoCYX KCYX = NHoWoK
|
||||
|
||||
template <typename CLayout,
|
||||
index_t NDimSp = NDimSpatial,
|
||||
|
||||
@@ -59,35 +59,35 @@ using device_grouped_conv_fwd_preshuffle_xdl_f16_generic_instances = std::tuple<
|
||||
//####################################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| | | | | | | Lengths_N_C_H_W| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
|
||||
//####################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
// generic instance
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 32, 8, 8, 16, 16, 8, 4, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 64, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 8, 4, 2>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 32, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 32, 8, 8, 32, 32, 4, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 32, 8, 8, 16, 16, 8, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 64, 8, 8, 16, 16, 8, 2, 4, 64, 3, 3, true, true, S<1, 8, 4, 2>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 2>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 64, 8, 8, 16, 16, 8, 4, 4, 64, 3, 3, true, true, S<1, 8, 4, 2>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 64, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 8, 4, 2>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 32, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 32, 8, 8, 32, 32, 4, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 32, 8, 8, 16, 16, 8, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 64, 8, 8, 16, 16, 8, 2, 4, 64, 3, 3, true, true, S<1, 8, 4, 2>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 16, 16, 4, 2, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 8, 32, 32, 2, 1, 3, 64, 3, 3, true, true, S<1, 8, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 8, 16, 16, 4, 2, 3, 64, 3, 3, true, true, S<1, 8, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
// // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, 6, 64, 3, 3, true, true, S<1, 4, 2, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>, // best
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 8, 32, 32, 2, 1, 3, 64, 3, 3, true, true, S<1, 8, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 8, 16, 16, 4, 2, 3, 64, 3, 3, true, true, S<1, 8, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, 6, 64, 3, 3, true, true, S<1, 4, 2, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, // best
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8>, // best
|
||||
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 64, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 8, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, // best
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 64, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 8, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8> // best
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 64, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 8, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8>, // best
|
||||
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
|
||||
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>
|
||||
|
||||
|
||||
// clang-format on
|
||||
|
||||
@@ -127,7 +127,7 @@ using device_grouped_conv_fwd_xdl_f16_instances = std::tuple<
|
||||
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
|
||||
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
// // generic instance
|
||||
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 32, 8, 8, 16, 16, 8, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1>
|
||||
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 64, 8, 8, 16, 16, 8, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1>
|
||||
// DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 112, 64, 64, 8, 8, 16, 16, 7, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 112, 64, 64, 8, 8, 16, 16, 7, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
// DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 112, 64, 64, 8, 8, 16, 16, 7, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
|
||||
|
||||
Reference in New Issue
Block a user