Revert "tmp save, trying processing full kperblock but it doesnt work"

This reverts commit 49a79b7ab4.
This commit is contained in:
Jakub Piasecki
2025-05-23 12:46:37 +00:00
parent 49a79b7ab4
commit a503b3c237
8 changed files with 98 additions and 231 deletions

View File

@@ -11,8 +11,8 @@ struct StreamConfig
hipStream_t stream_id_ = nullptr;
bool time_kernel_ = false;
int log_level_ = 0;
int cold_niters_ = 0;//5;
int nrepeat_ = 1;//50;
int cold_niters_ = 5;
int nrepeat_ = 50;
bool flush_cache = false;
int rotating_count = 1;

View File

@@ -1264,7 +1264,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
{
namespace ctc = tensor_layout::convolution;
const index_t W = arg.a_g_n_c_wis_lengths_[I4]; // assume 2d for now
const index_t G = arg.b_g_k_c_xs_lengths_[I0];
const index_t K = arg.b_g_k_c_xs_lengths_[I1];
const index_t C = arg.b_g_k_c_xs_lengths_[I2];
@@ -1352,7 +1351,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
return false;
}
}
//printf("checking a vector \n");
// check vector access of A
// FIXME: layout
@@ -1364,7 +1362,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
is_same_v<ALayout, ctc::NGCHW> || is_same_v<ALayout, ctc::NGCDHW>)
{
// Check access per C
if(!(ABlockTransferSrcVectorDim == 2 && W % ABlockTransferSrcScalarPerVector == 0)) // changed from C % to H%
if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0))
{
// If not possible, check access per G
if(!(ABlockTransferSrcVectorDim == 1 && (C == 1 || NumGroupsToMerge == 1) &&
@@ -1373,17 +1371,15 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>()) &&
G % ABlockTransferSrcScalarPerVector == 0))
{
printf("checking a vector failed \n");
return false;
}
}
}
else
{
printf("checking a vector wtf \n");
return false;
}
// printf("checking b vector \n");
// check vector access of B
// FIXME: layout
if constexpr(is_same_v<BLayout, ctc::G_K_X_C> || is_same_v<BLayout, ctc::G_K_YX_C> ||
@@ -1452,30 +1448,30 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
{
// if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
// {
// return false;
// }
if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
{
return false;
}
// if((G * K) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
// {
// return false;
// }
if((G * K) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
{
return false;
}
// const index_t input_spatial_acum = ck::accumulate_n<index_t>(
// arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
// const index_t output_spatial_acum = ck::accumulate_n<index_t>(
// arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
const index_t input_spatial_acum = ck::accumulate_n<index_t>(
arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
const index_t output_spatial_acum = ck::accumulate_n<index_t>(
arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
// if(input_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
// {
// return false;
// }
if(input_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
{
return false;
}
// if(output_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
// {
// return false;
// }
if(output_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
{
return false;
}
// if(!arg.p_workspace_)
// {
@@ -1501,7 +1497,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
{
return false;
}
// printf("checking e vector \n");
// check vector access of E
if constexpr(is_same_v<ELayout, ctc::G_NW_K> || is_same_v<ELayout, ctc::G_NHW_K> ||
is_same_v<ELayout, ctc::G_NDHW_K> || is_same_v<ELayout, ctc::GNWK> ||
@@ -1536,7 +1532,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
}
else
{
//printf("gonig into gemm validation\n");
return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
arg.b_grid_desc_n_k_,
arg.ds_grid_desc_m_n_,

View File

@@ -162,19 +162,19 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
make_tuple(NSlice, PaddedCSlice, HSlice, WSlice),
make_tuple(PaddedCSlice * HSlice * WSlice, HSlice * WSlice, WSlice, I1));
// constexpr auto nchw_slice_sliced_desc = transform_tensor_descriptor(
// nchw_slice_desc,
// make_tuple(make_pass_through_transform(NSlice),
// make_slice_transform(PaddedCSlice, I0, CSlice),
// make_pass_through_transform(HSlice),
// make_pass_through_transform(WSlice)),
// make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
// make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
constexpr auto nchw_padded_slice_desc = transform_tensor_descriptor(
constexpr auto nchw_slice_sliced_desc = transform_tensor_descriptor(
nchw_slice_desc,
make_tuple(make_pass_through_transform(NSlice),
make_pass_through_transform(PaddedCSlice),
make_slice_transform(PaddedCSlice, I0, CSlice),
make_pass_through_transform(HSlice),
make_pass_through_transform(WSlice)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
constexpr auto nchw_padded_slice_desc = transform_tensor_descriptor(
nchw_slice_sliced_desc,
make_tuple(make_pass_through_transform(NSlice),
make_pass_through_transform(CSlice),
make_pass_through_transform(HSlice),
make_pad_transform(WSlice, InLeftPadW, InRightPadW)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
@@ -183,7 +183,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
constexpr auto n_c_y_ho_x_wo_desc = transform_tensor_descriptor(
nchw_padded_slice_desc,
make_tuple(make_pass_through_transform(NSlice),
make_pass_through_transform(PaddedCSlice),
make_pass_through_transform(CSlice),
make_embed_transform(make_tuple(Y, HoutSlice),
make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, WoutSlice),
@@ -196,26 +196,19 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
constexpr auto mk_desc = transform_tensor_descriptor(
n_c_y_ho_x_wo_desc,
make_tuple(make_merge_transform(make_tuple(NSlice, HoutSlice, WoutSlice)),
make_merge_transform(make_tuple(PaddedCSlice, Y, X))),
make_merge_transform(make_tuple(CSlice, Y, X))),
make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
constexpr auto mk_desc_sliced = transform_tensor_descriptor(
constexpr auto FinalMKPad = KPerBlock - TrueKPerBlock;
constexpr auto mk_pad_desc = transform_tensor_descriptor(
mk_desc,
make_tuple(make_pass_through_transform(NSlice * HoutSlice * WoutSlice),
make_slice_transform(PaddedCSlice * Y * X, I0, KPerBlock)),
make_right_pad_transform(CSlice * Y * X, FinalMKPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// constexpr auto FinalMKPad = KPerBlock - TrueKPerBlock;
// constexpr auto mk_pad_desc = transform_tensor_descriptor(
// mk_desc,
// make_tuple(make_pass_through_transform(NSlice * HoutSlice * WoutSlice),
// make_right_pad_transform(PaddedCSlice * Y * X, FinalMKPad)),
// make_tuple(Sequence<0>{}, Sequence<1>{}),
// make_tuple(Sequence<0>{}, Sequence<1>{}));
// if(threadIdx.x == 0 && blockIdx.x == 0) {
// printf("nchw pad slice%d\n", nchw_padded_slice_desc.GetElementSize());
// printf("nhowocyx %d\n", nhowocyx_desc.GetElementSize());
@@ -225,7 +218,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
constexpr auto AK0 = KPerBlock / AK1;
return transform_tensor_descriptor(mk_desc_sliced,
return transform_tensor_descriptor(mk_pad_desc,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(MPerBlock)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
@@ -240,16 +233,6 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
make_tuple(Number<MPerBlock + ABlockLdsExtraM>{} * AK1, AK1, I1));
}
// __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1_padded()
// {
// // B matrix in LDS memory, dst of blockwise copy
// //constexpr BK0 = PaddedCSlice * X * Y;
// return make_naive_tensor_descriptor(
// make_tuple(BK0PerBlock, Number<NPerBlock>{}, BK1),
// make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1, BK1, I1));
// }
__host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
{
// B matrix in LDS memory, dst of blockwise copy
@@ -533,7 +516,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
__host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
{
const index_t num_loop = K / KPerBlock;
const index_t num_loop = K / TrueKPerBlock;
return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
}
@@ -637,27 +620,6 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
__device__ __host__ static constexpr auto GetMPerBlock() { return MPerBlock; }
template <typename AGridDesc_NCHW>
__host__ __device__ static auto GetAGridDescriptor_NCHW_BlockSlice(const AGridDesc_NCHW& a_grid_desc_nchw, const index_t& m_block_idx)
{
const index_t Hout = PadH ? a_grid_desc_nchw.GetLength(I2) - Y + 1 : a_grid_desc_nchw.GetLength(I2);
//const index_t m_block_data_idx_on_grid_nchw_n = __builtin_amdgcn_readfirstlane((m_block_idx * HoutSlice) / Hout);
//const index_t m_block_data_idx_on_grid_nchw_h = __builtin_amdgcn_readfirstlane((m_block_idx * HoutSlice) % Hout);
return transform_tensor_descriptor(
a_grid_desc_nchw,
make_tuple(make_pass_through_transform(a_grid_desc_nchw.GetLength(I0)),
make_pass_through_transform(a_grid_desc_nchw.GetLength(I1)),
make_slice_transform(a_grid_desc_nchw.GetLength(I2), (m_block_idx * HoutSlice) % Hout, HSlice),
make_pass_through_transform(a_grid_desc_nchw.GetLength(I3))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
}
template <bool HasMainKBlockLoop,
typename AGridDesc_AK0_M_AK1,
typename AGridDesc_NCHW,
@@ -718,13 +680,11 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
const index_t Hout = PadH ? a_grid_desc_nchw.GetLength(I2) - Y + 1 : a_grid_desc_nchw.GetLength(I2);
const index_t m_block_data_idx_on_grid_nchw_n = __builtin_amdgcn_readfirstlane((block_work_idx[I0] * HoutSlice) / Hout);
//const index_t m_block_data_idx_on_grid_nchw_h = __builtin_amdgcn_readfirstlane((block_work_idx[I0] * HoutSlice) % Hout);
const index_t m_block_data_idx_on_grid_nchw_h = __builtin_amdgcn_readfirstlane((block_work_idx[I0] * HoutSlice) % Hout);
const index_t n_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
const auto a_grid_desc_nchw_block_slice = GetAGridDescriptor_NCHW_BlockSlice(a_grid_desc_nchw, block_work_idx[I0]);
// if(threadIdx.x == 0) {
// printf("BlockIdx.x %d, mblock idx %d n %d h%d nblock idx %d Hout %d", blockIdx.x, block_work_idx[I0],
// m_block_data_idx_on_grid_nchw_n, m_block_data_idx_on_grid_nchw_h, n_block_data_idx_on_grid, Hout);
@@ -769,7 +729,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
Sequence<0, 1, 2, 3>,//ABlockTransferThreadClusterArrangeOrder,
ADataType,
AComputeDataType,
decltype(a_grid_desc_nchw_block_slice),
decltype(a_grid_desc_nchw),
decltype(a_block1_desc_cslice_hslice_wslice),
Sequence<0, 1, 2, 3>,//ABlockTransferSrcAccessOrder,
Sequence<0, 1, 2, 3>,//Sequence<1, 0, 2>,
@@ -782,8 +742,8 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
AThreadTransferSrcResetCoordinateAfterRun,
true,
NumGemmKPrefetchStage>(
a_grid_desc_nchw_block_slice,
make_multi_index(m_block_data_idx_on_grid_nchw_n, 0, 0, 0), // H slice is done inside of grid desc
a_grid_desc_nchw,
make_multi_index(m_block_data_idx_on_grid_nchw_n, 0, m_block_data_idx_on_grid_nchw_h, 0),
a_element_op,
a_block1_desc_cslice_hslice_wslice,
make_multi_index(0, 0, 0, 0),
@@ -810,7 +770,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
ABlockTransferDstScalarPerVector_AK1,//ABlockTransferDstScalarPerVector_AK1,
1,
1,
false,// was triue nad working before changes//AThreadTransferSrcResetCoordinateAfterRun,
true,//AThreadTransferSrcResetCoordinateAfterRun,
true,
NumGemmKPrefetchStage>(
a_block1_desc_nhowo_cyx,
@@ -944,40 +904,9 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
b_block_desc_bk0_n_bk1.GetElementSpaceSize());
//constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);
//constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 0, 0); // make multiple steps
// template <index_t N>
// using MultiIndex = StaticallyIndexedArray<index_t, N>;
[[maybe_unused]] StaticallyIndexedArray<index_t, 4> a_block1_slice_copy_steps[9] = {
make_multi_index(0, CSlice, 1, 0),
make_multi_index(0, CSlice, 2, 0),
make_multi_index(0, CSlice, 2, 0),
make_multi_index(0, CSlice, 1, 0),
make_multi_index(0, CSlice, 2, 0),
make_multi_index(0, CSlice, 2, 0),
make_multi_index(0, CSlice, 1, 0),
make_multi_index(0, CSlice, 2, 0),
make_multi_index(0, CSlice, 2, 0)
};
[[maybe_unused]] StaticallyIndexedArray<index_t, 3> a_block2_slice_copy_steps[9] = {
make_multi_index(0, 0, 2), // 2
make_multi_index(0, 0, -1), // -1
make_multi_index(0, 0, -1), // -1
make_multi_index(0, 0, 2), // 2
make_multi_index(0, 0, -1), // -1
make_multi_index(0, 0, -1), // -1
make_multi_index(0, 0, 2), // 2
make_multi_index(0, 0, -1), // -1
make_multi_index(0, 0, -1) // -1
};
// constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 1, 0); // make multiple steps
// constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 2, 0); // make multiple steps
// constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 2, 0); // make multiple steps
//constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 2); // as Y % mod(KPerBlock - CSlice*Y*X)
//constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 1); //
//constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 0);
constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 0, 0);
//constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
constexpr auto b_block_slice_copy_step = make_multi_index(TrueKPerBlock / BK1, 0, TrueKPerBlock % BK1);
// gridwise GEMM pipeline
const auto gridwise_gemm_pipeline =
@@ -985,14 +914,15 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
(a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
KPerBlock); // TrueKPerBlock ?
TrueKPerBlock) + ((a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) %
TrueKPerBlock != 0); // TrueKPerBlock ?
if(threadIdx.x == 0 && blockIdx.x == 0) {
printf("hasmainkblockloop: %d\n", HasMainKBlockLoop);
printf("Running %d k block iters\n", num_k_block_main_loop);
}
// if(threadIdx.x == 0) {
// printf("hasmainkblockloop: %d\n", HasMainKBlockLoop);
// printf("Running %d k block iters", num_k_block_main_loop);
// }
gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_nchw_block_slice, // global
gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_nchw, // global
a_block1_desc_cslice_hslice_wslice, // lds1 write
a_block1_desc_nhowo_cyx, // lds1 read
a_block2_desc_ak0_m_ak1, // lds2 write/read
@@ -1001,8 +931,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
a_grid_buf,
a_block_buf1,
a_block_buf2,
a_block1_slice_copy_steps,
a_block2_slice_copy_steps,
a_block_slice_copy_step,
b_grid_desc_bk0_n_bk1,
b_block_desc_bk0_n_bk1,
b_blockwise_copy,
@@ -1099,7 +1028,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
CShuffleDataType,
decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2), // MN
decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
ck::tensor_operation::element_wise::PassThrough,
Sequence<CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,

View File

@@ -72,7 +72,7 @@ struct GridwiseGemmPipeline_v1<1, true, true>
// if(threadIdx.x == 0){
// for(int m=0; m<128; ++m) {
// for(int k0=0; k0<4; ++k0) {
// for(int k0=0; k0<8; ++k0) {
// for(int k1=0;k1<8;++k1) {
// printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block_buf[k0*8*128 + m*8 + k1]));
// }
@@ -102,16 +102,6 @@ struct GridwiseGemmPipeline_v1<1, true, true>
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
block_sync_lds();
// if(threadIdx.x == 0){
// for(int m=0; m<128; ++m) {
// for(int k0=0; k0<4; ++k0) {
// for(int k1=0;k1<8;++k1) {
// printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block_buf[k0*8*128 + m*8 + k1]));
// }
// }
// }
// }
++i;
} while(i < (num_loop - 1));

View File

@@ -36,8 +36,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
typename AGridBuffer,
typename ABlock1Buffer,
typename ABlock2Buffer,
typename ABlock1TransferSteps,
typename ABlock2TransferSteps,
typename ABlockTransferStep,
typename BGridDesc,
typename BBlockDesc,
typename BBlockTransfer,
@@ -55,8 +54,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
[[maybe_unused]] const AGridBuffer& a_grid_buf,
[[maybe_unused]] ABlock1Buffer& a_block1_buf,
[[maybe_unused]] ABlock2Buffer& a_block2_buf,
[[maybe_unused]] const ABlock1TransferSteps& a_block1_copy_steps,
[[maybe_unused]] const ABlock2TransferSteps& a_block2_copy_steps,
[[maybe_unused]] const ABlockTransferStep& a_block_copy_step,
[[maybe_unused]] const BGridDesc& b_grid_desc,
[[maybe_unused]] const BBlockDesc& b_block_desc,
[[maybe_unused]] BBlockTransfer& b_blockwise_copy,
@@ -71,7 +69,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
a_blockwise_global_to_lds1_copy.RunRead(a_grid_desc, a_grid_buf);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block1_copy_steps[0]);
a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
//block_sync_lds();
a_blockwise_global_to_lds1_copy.RunWrite(a_block1_desc_nchw_slice, a_block1_buf);
@@ -85,11 +83,9 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
a_blockwise_lds1_to_lds2_copy.RunRead(a_block1_desc_nhowo_cyx, a_block1_buf);//a_block2_buf); ?? jaki bufor
a_blockwise_lds1_to_lds2_copy.RunWrite(a_block2_desc_ak0_m_k1, a_block2_buf);
//block_sync_lds();
a_blockwise_lds1_to_lds2_copy.MoveSrcSliceWindow(a_block1_desc_nhowo_cyx, a_block2_copy_steps[0]);
// if(threadIdx.x == 0) {
// printf("Next iter\n");
// for(int c=0; c<5; ++c) {
// for(int c=0; c<4; ++c) {
// for(int h=0; h<2;++h) {
// for(int w=0; w<64;++w) {
// printf("Ag[%d][%d][%d]=%f\n", c, h, w, static_cast<float>(a_grid_buf[c * 64 * 2 + h * 64 + w]));
@@ -97,9 +93,9 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
// }
// }
// // // A[3][2][33]=0.000000
// // A[3][2][33]=0.000000
// for(int c=0; c<4; ++c) {
// for(int c=0; c<8; ++c) {
// for(int h=0; h<4;++h) {
// for(int w=0; w<64;++w) {
// printf("A[%d][%d][%d]=%f\n", c, h, w, static_cast<float>(a_block1_buf[c * 64 * 4 + h * 64 + w]));
@@ -108,14 +104,14 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
// }
// for(int m=0; m<128; ++m) {
// for(int k0=0; k0<4; ++k0) {
// for(int k0=0; k0<8; ++k0) {
// for(int k1=0;k1<8;++k1) {
// printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block2_buf[k0*8*128 + m*8 + k1]));
// // int k = k0*8+k1;
// // int x = k%3;
// // int y = (k/3)%3;
// // int c = k/(9);
// int k = k0*8+k1;
// int x = k%3;
// int y = (k/3)%3;
// int c = k/(9);
// // A[m][k] = A[c, y+m/64,x+m%64] z oryginalnego obrazka
@@ -146,7 +142,6 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
do
{ // ask bartek how to structure this pipeline
a_blockwise_global_to_lds1_copy.RunRead(a_grid_desc, a_grid_buf); // A Global -> VGPR
block_sync_lds();
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); // B Global -> VGPR
//
@@ -156,53 +151,13 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
block_sync_lds();
a_blockwise_lds1_to_lds2_copy.RunRead(a_block1_desc_nhowo_cyx, a_block1_buf); // A LDS1 -> LDS2
a_blockwise_lds1_to_lds2_copy.MoveSrcSliceWindow(a_block1_desc_nhowo_cyx, a_block2_copy_steps[i+1]);
a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block1_copy_steps[i+1]);
a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
//block_sync_lds();
a_blockwise_lds1_to_lds2_copy.RunWrite(a_block2_desc_ak0_m_k1, a_block2_buf);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
// block_sync_lds(); // delete later
// if(threadIdx.x == 0) {
// printf("Next iter\n");
// // // A[3][2][33]=0.000000
// for(int c=0; c<4; ++c) {
// for(int h=0; h<4;++h) {
// for(int w=0; w<64;++w) {
// printf("A[%d][%d][%d]=%f\n", c, h, w, static_cast<float>(a_block1_buf[c * 64 * 4 + h * 64 + w]));
// }
// }
// }
// for(int m=0; m<128; ++m) {
// for(int k0=0; k0<4; ++k0) {
// for(int k1=0;k1<8;++k1) {
// printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block2_buf[k0*8*128 + m*8 + k1]));
// // int k = k0*8+k1;
// // int x = k%3;
// // int y = (k/3)%3;
// // int c = k/(9);
// // A[m][k] = A[c, y+m/64,x+m%64] z oryginalnego obrazka
// // if(x+m%64 > 0 && x+m%64 < 127) {
// // auto lds1 = static_cast<float>(a_block1_buf[c * 64 * 4 + (m/64 + y) * 64 + m%64 + x - 1]);
// // auto lds2 = static_cast<float>(a_block2_buf[k0*8*128 + m*8 + k1]);
// // printf("lds2[%d][%d]:%f lds1[%d][%d][%d]:%f %s\n", m, k0*8+k1, lds2, c, y + m/64, m%64 + x - 1, lds1, (lds1 > lds2)? "diff" : "");
// // }
// // if(lds1 > lds2) {
// // printf("diff lds1[%d][%d][%d]:%f lds2[%d][%d]:%f\n", c, y + m/64, m%64 + x, lds1, m, k0*8+k1 - 1, lds2);
// // }
// }
// }
// }
// }
//block_sync_lds();
++i;
} while(i < (num_loop - 1));

View File

@@ -1477,8 +1477,6 @@ struct TransformConvFwdToGemm
}
}
// NGCHW x KCYX
// NHoWoCYX KCYX = NHoWoK
template <typename CLayout,
index_t NDimSp = NDimSpatial,

View File

@@ -59,35 +59,35 @@ using device_grouped_conv_fwd_preshuffle_xdl_f16_generic_instances = std::tuple<
//####################################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| | | | | | | Lengths_N_C_H_W| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//####################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 32, 8, 8, 16, 16, 8, 4, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 64, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 8, 4, 2>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 32, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 32, 8, 8, 32, 32, 4, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 32, 8, 8, 16, 16, 8, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 64, 8, 8, 16, 16, 8, 2, 4, 64, 3, 3, true, true, S<1, 8, 4, 2>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 2>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 64, 8, 8, 16, 16, 8, 4, 4, 64, 3, 3, true, true, S<1, 8, 4, 2>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 64, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 8, 4, 2>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 32, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 32, 8, 8, 32, 32, 4, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 32, 8, 8, 16, 16, 8, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 32, 64, 8, 8, 16, 16, 8, 2, 4, 64, 3, 3, true, true, S<1, 8, 4, 2>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 16, 16, 4, 2, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 8, 32, 32, 2, 1, 3, 64, 3, 3, true, true, S<1, 8, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 8, 16, 16, 4, 2, 3, 64, 3, 3, true, true, S<1, 8, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
// // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, 6, 64, 3, 3, true, true, S<1, 4, 2, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>, // best
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 8, 32, 32, 2, 1, 3, 64, 3, 3, true, true, S<1, 8, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 8, 16, 16, 4, 2, 3, 64, 3, 3, true, true, S<1, 8, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, 6, 64, 3, 3, true, true, S<1, 4, 2, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, // best
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8>, // best
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 64, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 8, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, // best
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 64, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 8, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8> // best
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 64, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 8, 4, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8>, // best
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, 3, 64, 3, 3, true, true, S<1, 4, 1, 8>, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, 4, 64, 3, 3, true, true, S<1, 4, 4, 4>, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>
// clang-format on

View File

@@ -127,7 +127,7 @@ using device_grouped_conv_fwd_xdl_f16_instances = std::tuple<
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// // generic instance
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 32, 8, 8, 16, 16, 8, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 128, 64, 64, 8, 8, 16, 16, 8, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1>
// DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 112, 64, 64, 8, 8, 16, 16, 7, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 112, 64, 64, 8, 8, 16, 16, 7, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,
// DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1, 64, 112, 64, 64, 8, 8, 16, 16, 7, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,