mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-30 11:47:48 +00:00
[CK_TILE] Enable vector stores for C Column Layout part1
This commit is contained in:
@@ -615,9 +615,9 @@ struct buffer_view<address_space_enum::global,
|
||||
using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
|
||||
|
||||
// X contains multiple T
|
||||
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
|
||||
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size; // 1
|
||||
|
||||
constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
|
||||
constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size; // 8
|
||||
|
||||
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
|
||||
"wrong! X should contain multiple T");
|
||||
@@ -650,7 +650,15 @@ struct buffer_view<address_space_enum::global,
|
||||
bool constexpr use_amd_buffer_addressing = false;
|
||||
#endif
|
||||
|
||||
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
|
||||
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; // 8
|
||||
//static_assert(scalar_per_x_vector > scalar_per_t_vector, "Condition not met: (( scalar_per_x_vector > scalar_per_t_vector ))");
|
||||
//if(threadIdx.x == 0)
|
||||
//{
|
||||
// printf("[DEBUG]: BufferView: t_per_x: %d\n",t_per_x);
|
||||
// printf("[DEBUG]: BufferView: scalar_per_x_vector: %d\n",scalar_per_x_vector);
|
||||
// printf("[DEBUG]: BufferView: scalar_per_t_vector: %d\n",scalar_per_t_vector);
|
||||
// printf("[DEBUG]: BufferView: x.size(): %d\n",x.size());
|
||||
//}
|
||||
|
||||
if constexpr(use_amd_buffer_addressing)
|
||||
{
|
||||
|
||||
@@ -65,6 +65,24 @@ struct CShuffleEpilogueProblem
|
||||
|
||||
static_assert(NumDTensor == DsLayout::size(),
|
||||
"The size of DsDataType and DsLayout should be the same");
|
||||
|
||||
CK_TILE_HOST static void PrintInfo() {
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: kBlockSize: %d\n",kBlockSize);
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: kMPerBlock: %d\n",kMPerBlock);
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: kNPerBlock: %d\n",kNPerBlock);
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: MWave: %d\n",MWave);
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: NWave: %d\n",NWave);
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: MPerXdl: %d\n",MPerXdl);
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: NPerXdl: %d\n",NPerXdl);
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: KPerXdl: %d\n",KPerXdl);
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: isCTransposed: %d\n",isCTransposed);
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: MemoryOperation: %d\n",static_cast<int>(MemoryOperation));
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: FixedVectorSize: %d\n",static_cast<int>(FixedVectorSize));
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: VectorSizeC: %d\n",VectorSizeC);
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: TiledMMAPermuteN: %d\n",static_cast<int>(TiledMMAPermuteN));
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: kNumWaveGroups: %d\n",kNumWaveGroups);
|
||||
printf("[DEBUG]: CShuffleEpilogueProblem: NumDTensor: %d\n",NumDTensor);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Problem_, typename Policy_ = void>
|
||||
@@ -122,6 +140,19 @@ struct CShuffleEpilogue
|
||||
CDElementwise elfunc_;
|
||||
|
||||
CK_TILE_DEVICE CShuffleEpilogue(CDElementwise elfunc = CDElementwise{}) : elfunc_(elfunc) {};
|
||||
static constexpr bool IsERowMajor =
|
||||
std::is_same_v<ELayout, tensor_layout::gemm::RowMajor> ? true : false;
|
||||
|
||||
CK_TILE_HOST static void PrintInfo() {
|
||||
printf("[DEBUG]: CShuffleEpilogue: MPerIteration: %d\n",MPerIteration);
|
||||
printf("[DEBUG]: CShuffleEpilogue: NPerIteration: %d\n",NPerIteration);
|
||||
printf("[DEBUG]: CShuffleEpilogue: MRepeat: %d\n",MRepeat);
|
||||
printf("[DEBUG]: CShuffleEpilogue: NRepeat: %d\n",NRepeat);
|
||||
printf("[DEBUG]: CShuffleEpilogue: GetVectorSizeC: %d\n",GetVectorSizeC());
|
||||
printf("[DEBUG]: CShuffleEpilogue: get_warp_size: %d\n",get_warp_size());
|
||||
printf("[DEBUG]: CShuffleEpilogue: MPerIterationShuffle: %d\n",MPerIterationShuffle);
|
||||
printf("[DEBUG]: CShuffleEpilogue: NPerIterationShuffle: %d\n",NPerIterationShuffle);
|
||||
}
|
||||
|
||||
static_assert(NumDTensor == DsLayout::size(),
|
||||
"The size of DsDataType and DsLayout should be the same");
|
||||
@@ -248,6 +279,22 @@ struct CShuffleEpilogue
|
||||
static constexpr index_t MPerIterationShuffle = std::get<0>(MNPerIterationShuffle);
|
||||
static constexpr index_t NPerIterationShuffle = std::get<1>(MNPerIterationShuffle);
|
||||
|
||||
static constexpr index_t NumYXdlPerWavePerShuffle =
|
||||
IsERowMajor ? NumMXdlPerWavePerShuffle : NumNXdlPerWavePerShuffle;
|
||||
static constexpr index_t NumXXdlPerWavePerShuffle =
|
||||
IsERowMajor ? NumNXdlPerWavePerShuffle : NumMXdlPerWavePerShuffle;
|
||||
|
||||
static constexpr index_t YPerIterationShuffle =
|
||||
IsERowMajor ? MPerIterationShuffle : NPerIterationShuffle;
|
||||
static constexpr index_t XPerIterationShuffle =
|
||||
IsERowMajor ? NPerIterationShuffle : MPerIterationShuffle;
|
||||
|
||||
static constexpr index_t YPerBlock = IsERowMajor ? kMPerBlock : kNPerBlock;
|
||||
static constexpr index_t XPerBlock = IsERowMajor ? kNPerBlock : kMPerBlock;
|
||||
|
||||
static constexpr index_t YWave = IsERowMajor ? MWave : NWave;
|
||||
static constexpr index_t XWave = IsERowMajor ? NWave : MWave;
|
||||
|
||||
using WG = WarpGemmDispatcher<ATypeToUse,
|
||||
BTypeToUse,
|
||||
AccDataType,
|
||||
@@ -277,8 +324,8 @@ struct CShuffleEpilogue
|
||||
else if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::ColumnMajor>)
|
||||
{
|
||||
return make_naive_tensor_descriptor(
|
||||
make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
|
||||
make_tuple(number<1>{}, number<MPerIterationShuffle>{}));
|
||||
make_tuple(number<NPerIterationShuffle>{}, number<MPerIterationShuffle>{}),
|
||||
make_tuple(number<MPerIterationShuffle>{}, number<1>{}));
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -292,8 +339,8 @@ struct CShuffleEpilogue
|
||||
if constexpr(BlockedXDLN_PerWarp == 1)
|
||||
{
|
||||
return tile_distribution_encoding<sequence<>,
|
||||
tuple<sequence<NumMXdlPerWavePerShuffle, MWave>,
|
||||
sequence<NumNXdlPerWavePerShuffle, NWave>>,
|
||||
tuple<sequence<NumYXdlPerWavePerShuffle, YWave>,
|
||||
sequence<NumXXdlPerWavePerShuffle, XWave>>,
|
||||
tuple<sequence<1, 2>>,
|
||||
tuple<sequence<1, 1>>,
|
||||
sequence<1, 2>,
|
||||
@@ -301,12 +348,12 @@ struct CShuffleEpilogue
|
||||
}
|
||||
else
|
||||
{
|
||||
constexpr int RakedXDLN_PerWarp = NumNXdlPerWavePerShuffle / BlockedXDLN_PerWarp;
|
||||
constexpr int RakedXDLN_PerWarp = NumXXdlPerWavePerShuffle / BlockedXDLN_PerWarp;
|
||||
// BlockedLayout
|
||||
return tile_distribution_encoding<
|
||||
sequence<>,
|
||||
tuple<sequence<NumMXdlPerWavePerShuffle, MWave>,
|
||||
sequence<RakedXDLN_PerWarp, NWave, BlockedXDLN_PerWarp>>,
|
||||
tuple<sequence<NumYXdlPerWavePerShuffle, YWave>,
|
||||
sequence<RakedXDLN_PerWarp, XWave, BlockedXDLN_PerWarp>>,
|
||||
tuple<sequence<1, 2>>,
|
||||
tuple<sequence<1, 1>>,
|
||||
sequence<1, 2, 2>,
|
||||
@@ -622,22 +669,65 @@ struct CShuffleEpilogue
|
||||
auto o_lds_block = make_tensor_view<address_space_enum::lds>(
|
||||
static_cast<ODataType*>(p_smem), lds_block_desc);
|
||||
|
||||
auto in_lds_window = make_tile_window(
|
||||
o_lds_block,
|
||||
make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
|
||||
{0, 0},
|
||||
LdsTileDistr);
|
||||
auto in_lds_window = [&o_lds_block, &LdsTileDistr] {
|
||||
if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return make_tile_window(
|
||||
o_lds_block,
|
||||
make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
|
||||
{0, 0},
|
||||
LdsTileDistr);
|
||||
}
|
||||
else if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::ColumnMajor>)
|
||||
{
|
||||
return make_tile_window(
|
||||
o_lds_block,
|
||||
make_tuple(number<NPerIterationShuffle>{}, number<MPerIterationShuffle>{}),
|
||||
{0, 0},
|
||||
LdsTileDistr);
|
||||
}
|
||||
else
|
||||
{
|
||||
static_assert(false, "Unsupported ELayout!");
|
||||
}
|
||||
}();
|
||||
//auto in_lds_window = make_tile_window(
|
||||
// o_lds_block,
|
||||
// make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
|
||||
// {0, 0},
|
||||
// LdsTileDistr);
|
||||
|
||||
auto out_lds_window = make_tile_window(
|
||||
o_lds_block,
|
||||
make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
|
||||
{0, 0});
|
||||
//auto out_lds_window = make_tile_window(
|
||||
// o_lds_block,
|
||||
// make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
|
||||
// {0, 0});
|
||||
|
||||
auto out_lds_window = [&o_lds_block] {
|
||||
if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return make_tile_window(
|
||||
o_lds_block,
|
||||
make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
|
||||
{0, 0});
|
||||
}
|
||||
else if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::ColumnMajor>)
|
||||
{
|
||||
return make_tile_window(
|
||||
o_lds_block,
|
||||
make_tuple(number<NPerIterationShuffle>{}, number<MPerIterationShuffle>{}),
|
||||
{0, 0});
|
||||
}
|
||||
else
|
||||
{
|
||||
static_assert(false, "Unsupported ELayout!");
|
||||
}
|
||||
}();
|
||||
|
||||
constexpr index_t num_access = SFC::get_num_of_access();
|
||||
|
||||
static_assert(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>,
|
||||
"Currently, the CShuffle Epilogue only supports the Row Major Output layout");
|
||||
|
||||
// TODO: Add support for Col Major Output Layout - CShuffle Epilogue
|
||||
//static_assert(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>,
|
||||
// "Currently, the CShuffle Epilogue only supports the Row Major Output layout");
|
||||
static_assert(GetVectorSizeC() > 1, "VectorSizeC is not greater than 1!");
|
||||
using TileEncodingPattern =
|
||||
tile_distribution_encoding_pattern_2d<kBlockSize,
|
||||
MPerIterationShuffle,
|
||||
|
||||
@@ -740,9 +740,9 @@ struct UniversalGemmKernel
|
||||
{
|
||||
return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
|
||||
e_ptr,
|
||||
make_tuple(kargs.M, kargs.N),
|
||||
make_tuple(1, kargs.stride_E),
|
||||
number<1>{},
|
||||
make_tuple(kargs.N, kargs.M),
|
||||
make_tuple(kargs.stride_E, 1),
|
||||
number<EpiloguePipeline::GetVectorSizeC()>{},
|
||||
number<1>{});
|
||||
}
|
||||
}();
|
||||
@@ -831,9 +831,9 @@ struct UniversalGemmKernel
|
||||
else
|
||||
{
|
||||
return pad_tensor_view(e_tensor_view,
|
||||
make_tuple(number<TilePartitioner::MPerBlock>{},
|
||||
number<TilePartitioner::NPerBlock>{}),
|
||||
sequence<GemmPipeline::kPadM, false>{});
|
||||
make_tuple(number<TilePartitioner::NPerBlock>{},
|
||||
number<TilePartitioner::MPerBlock>{}),
|
||||
sequence<false, GemmPipeline::kPadM>{});
|
||||
}
|
||||
}();
|
||||
|
||||
@@ -929,10 +929,25 @@ struct UniversalGemmKernel
|
||||
},
|
||||
number<NumDTensor>{});
|
||||
|
||||
auto e_block_window = make_tile_window(
|
||||
e_pad_view,
|
||||
make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
|
||||
{i_m, i_n});
|
||||
|
||||
|
||||
const auto e_block_window = [&] () {
|
||||
if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return make_tile_window(e_pad_view,
|
||||
make_tuple(number<TilePartitioner::MPerBlock>{},
|
||||
number<TilePartitioner::NPerBlock>{}),
|
||||
{i_m, i_n});
|
||||
}
|
||||
else
|
||||
{
|
||||
return make_tile_window(e_pad_view,
|
||||
make_tuple(number<TilePartitioner::NPerBlock>{},
|
||||
number<TilePartitioner::MPerBlock>{}),
|
||||
{i_n, i_m});
|
||||
}
|
||||
}();
|
||||
|
||||
|
||||
return make_tuple(as_block_window, bs_block_window, ds_block_window, e_block_window);
|
||||
}
|
||||
@@ -986,7 +1001,19 @@ struct UniversalGemmKernel
|
||||
// Run Epilogue Pipeline
|
||||
auto& c_block_window = gemm_tile_windows.at(I3);
|
||||
|
||||
//if(threadIdx.x == 0)
|
||||
//{
|
||||
// printf("CShuffleEpilogue operator() called! Before\n");
|
||||
// c_block_window.template print_tile_window_range<EDataType>(0, 4, 0, 8, "A");
|
||||
//}
|
||||
|
||||
EpiloguePipeline{}(c_block_window, c_block_tile, ds_block_window, smem_ptr_0);
|
||||
|
||||
//if(threadIdx.x == 0)
|
||||
//{
|
||||
// printf("CShuffleEpilogue operator() called! After\n");
|
||||
// c_block_window.template print_tile_window_range<EDataType>(0, 4, 0, 8, "A");
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -78,46 +78,46 @@ using KernelTypesMemWmma = ::testing::Types<
|
||||
>;
|
||||
|
||||
using KernelTypesCompV3 = ::testing::Types<
|
||||
std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Row, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Row, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Row, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Row, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Row, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Row, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Row, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Row, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Row, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Col, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Col, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Col, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Col, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Col, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Col, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Col, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Col, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Row, Col, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Row, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Row, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Row, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Row, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Row, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Row, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Row, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Row, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Row, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Col, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Col, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Col, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Col, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Col, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Col, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Col, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Col, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
std::tuple< Col, Col, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>
|
||||
std::tuple< Row, Row, Col, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>
|
||||
//std::tuple< Row, Row, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Row, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Row, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Row, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Row, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Row, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Row, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Row, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Row, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Col, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Col, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Col, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Col, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Col, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Col, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Col, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Col, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Row, Col, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Row, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Row, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Row, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Row, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Row, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Row, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Row, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Row, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Row, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Col, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Col, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Col, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Col, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Col, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Col, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Col, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Col, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>,
|
||||
//std::tuple< Col, Col, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>
|
||||
>;
|
||||
|
||||
using KernelTypesCompV3Wmma = ::testing::Types<
|
||||
|
||||
361
test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
Normal file
361
test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc
Normal file
@@ -0,0 +1,361 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#pragma once
|
||||
|
||||
template <typename GemmConfig,
|
||||
typename ADataType,
|
||||
typename BDataType,
|
||||
typename DsDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename DsLayout,
|
||||
typename ELayout,
|
||||
bool Persistent,
|
||||
typename CDEElementWise>
|
||||
float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
|
||||
|
||||
{
|
||||
using GemmShape = ck_tile::TileGemmShape<
|
||||
ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
|
||||
ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
|
||||
ck_tile::
|
||||
sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
|
||||
GemmConfig::PermuteA,
|
||||
GemmConfig::PermuteB>;
|
||||
|
||||
using TilePartitioner =
|
||||
ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
|
||||
GemmConfig::TileParitionerGroupNum,
|
||||
GemmConfig::TileParitionerM01>;
|
||||
|
||||
using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
|
||||
GemmConfig::kPadN,
|
||||
GemmConfig::kPadK,
|
||||
ALayout,
|
||||
BLayout,
|
||||
ELayout,
|
||||
GemmConfig::NumWaveGroups>;
|
||||
|
||||
using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
|
||||
GemmConfig::kPadN,
|
||||
GemmConfig::kPadK,
|
||||
GemmConfig::DoubleSmemBuffer,
|
||||
ALayout,
|
||||
BLayout,
|
||||
ELayout,
|
||||
GemmConfig::TransposeC,
|
||||
GemmConfig::UseStructuredSparsity,
|
||||
Persistent,
|
||||
GemmConfig::NumWaveGroups>;
|
||||
using GemmPipelineProblem =
|
||||
ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
|
||||
|
||||
using BaseGemmPipeline = typename PipelineTypeTraits<
|
||||
GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
|
||||
|
||||
const ck_tile::index_t k_grain = args.k_batch * GemmConfig::K_Tile;
|
||||
const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
|
||||
const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
|
||||
const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop);
|
||||
const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
|
||||
|
||||
float ave_time{0};
|
||||
|
||||
const auto Run = [&](const auto has_hot_loop_,
|
||||
const auto tail_number_,
|
||||
const auto memory_operation_) {
|
||||
constexpr bool has_hot_loop_v = has_hot_loop_.value;
|
||||
constexpr auto tail_number_v = tail_number_.value;
|
||||
constexpr auto scheduler = GemmConfig::Scheduler;
|
||||
constexpr auto memory_operation = memory_operation_.value;
|
||||
|
||||
using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
GemmShape,
|
||||
GemmUniversalTraits,
|
||||
scheduler,
|
||||
has_hot_loop_v,
|
||||
tail_number_v>;
|
||||
|
||||
using GemmPipeline = typename PipelineTypeTraits<
|
||||
GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
|
||||
using GemmEpilogue = ck_tile::CShuffleEpilogue<
|
||||
ck_tile::CShuffleEpilogueProblem<ADataType,
|
||||
BDataType,
|
||||
DsDataType,
|
||||
AccDataType,
|
||||
CDataType,
|
||||
DsLayout,
|
||||
ELayout,
|
||||
CDEElementWise,
|
||||
TilePartitioner::MPerBlock,
|
||||
TilePartitioner::NPerBlock,
|
||||
GemmConfig::M_Warp,
|
||||
GemmConfig::N_Warp,
|
||||
GemmConfig::M_Warp_Tile,
|
||||
GemmConfig::N_Warp_Tile,
|
||||
GemmConfig::K_Warp_Tile,
|
||||
UniversalGemmProblem::TransposeC,
|
||||
memory_operation,
|
||||
GemmConfig::NumWaveGroups>>;
|
||||
using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
|
||||
auto kargs = Kernel::MakeKernelArgs(args);
|
||||
|
||||
dim3 grids;
|
||||
if constexpr(Persistent)
|
||||
{
|
||||
grids = Kernel::MaxOccupancyGridSize(s);
|
||||
}
|
||||
else
|
||||
{
|
||||
grids = Kernel::GridSize(args.M, args.N, args.k_batch);
|
||||
}
|
||||
const dim3 blocks = Kernel::BlockSize();
|
||||
|
||||
if(!Kernel::IsSupportedArgument(kargs))
|
||||
{
|
||||
throw ArgumentsNotSupportedException(
|
||||
"Wrong! Arguments not supported! Skipping gemm!\n");
|
||||
}
|
||||
|
||||
if(s.log_level_ > 0)
|
||||
{
|
||||
std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
|
||||
<< "shape: " << GemmShape::GetName() << '\n'
|
||||
<< "problem: " << GemmPipelineProblem::GetName() << '\n'
|
||||
<< "pipeline: " << GemmPipeline::GetName() << '\n'
|
||||
<< "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
|
||||
<< ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
|
||||
<< std::endl;
|
||||
}
|
||||
if(s.flush_cache_)
|
||||
{
|
||||
std::cout << "Flushing cache..." << std::endl;
|
||||
static constexpr ck_tile::index_t APackedSize =
|
||||
std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
|
||||
static constexpr ck_tile::index_t BPackedSize =
|
||||
std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
|
||||
|
||||
ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
|
||||
args.M, args.K, args.stride_A, is_row_major(ALayout{})));
|
||||
ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
|
||||
args.K, args.N, args.stride_B, is_row_major(BLayout{})));
|
||||
|
||||
auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
|
||||
auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
|
||||
|
||||
ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
|
||||
kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
|
||||
rotating_mem.Print();
|
||||
|
||||
auto run_flush_cache = [&]() {
|
||||
// flush icache
|
||||
ck_tile::flush_icache();
|
||||
// rotating mem
|
||||
rotating_mem.Next();
|
||||
// clear c mem
|
||||
if(args.k_batch > 1)
|
||||
hipGetErrorString(hipMemsetAsync(
|
||||
args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
|
||||
};
|
||||
ave_time = ck_tile::launch_kernel_time_mask(
|
||||
s,
|
||||
run_flush_cache,
|
||||
ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
|
||||
}
|
||||
else
|
||||
{
|
||||
ave_time = ck_tile::launch_kernel(
|
||||
s,
|
||||
ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
|
||||
}
|
||||
return ave_time;
|
||||
};
|
||||
|
||||
const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
|
||||
//if(args.k_batch == 1)
|
||||
//{
|
||||
Run(has_hot_loop_,
|
||||
tail_number_,
|
||||
ck_tile::integral_constant<ck_tile::memory_operation_enum,
|
||||
ck_tile::memory_operation_enum::set>{});
|
||||
//}
|
||||
//else
|
||||
//{
|
||||
// Run(has_hot_loop_,
|
||||
// tail_number_,
|
||||
// ck_tile::integral_constant<ck_tile::memory_operation_enum,
|
||||
// ck_tile::memory_operation_enum::atomic_add>{});
|
||||
//}
|
||||
};
|
||||
|
||||
BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
template <typename GemmConfig,
|
||||
typename APrecType,
|
||||
typename BPrecType = APrecType,
|
||||
typename CPrecType = APrecType>
|
||||
bool run_gemm_test_prec_type(std::string a_layout,
|
||||
std::string b_layout,
|
||||
ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
using Row = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
|
||||
{
|
||||
if(a_layout == "R" && b_layout == "C")
|
||||
{
|
||||
return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
|
||||
arg_parser, Row{}, Col{}, Row{});
|
||||
}
|
||||
else if(a_layout == "C" && b_layout == "C")
|
||||
{
|
||||
return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
|
||||
arg_parser, Col{}, Col{}, Row{});
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported memory layout for the input matrices when "
|
||||
"BPrecType is ck_tile::pk_int4_t!");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(a_layout == "R" && b_layout == "R")
|
||||
{
|
||||
return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
|
||||
arg_parser, Row{}, Row{}, Row{});
|
||||
}
|
||||
else if(a_layout == "R" && b_layout == "C")
|
||||
{
|
||||
return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
|
||||
arg_parser, Row{}, Col{}, Row{});
|
||||
}
|
||||
else if(a_layout == "C" && b_layout == "R")
|
||||
{
|
||||
return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
|
||||
arg_parser, Col{}, Row{}, Row{});
|
||||
}
|
||||
else if(a_layout == "C" && b_layout == "C")
|
||||
{
|
||||
return run_gemm_test_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
|
||||
arg_parser, Col{}, Col{}, Row{});
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported memory layout for the input matrices!");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GemmConfig, typename APrecType, typename BPrecType, typename CPrecType>
|
||||
bool run_gemm_test(int argc, char* argv[])
|
||||
{
|
||||
auto [result, arg_parser] = create_args(argc, argv);
|
||||
if(!result)
|
||||
return false;
|
||||
|
||||
std::string a_layout = arg_parser.get_str("a_layout");
|
||||
std::string b_layout = arg_parser.get_str("b_layout");
|
||||
|
||||
return run_gemm_test_prec_type<GemmConfig, APrecType, BPrecType, CPrecType>(
|
||||
a_layout, b_layout, arg_parser);
|
||||
}
|
||||
|
||||
template <typename APrecType, typename BPrecType = APrecType, typename CPrecType = APrecType>
|
||||
int run_gemm_combinations()
|
||||
{
|
||||
// Define possible values for each parameter
|
||||
std::vector<std::string> m_values = {"512", "1024"};
|
||||
std::vector<std::string> n_values = {"512", "2048"};
|
||||
std::vector<std::string> k_values = {"512", "1024"};
|
||||
|
||||
// We'll store all our arguments as strings first
|
||||
std::vector<std::string> arg_strings = {"./bin/tile_example_gemm_universal",
|
||||
"", // m placeholder
|
||||
"", // n placeholder
|
||||
"", // k placeholder
|
||||
"-stride_a=0",
|
||||
"-stride_b=0",
|
||||
"-stride_c=0",
|
||||
"-v=2",
|
||||
"-warmup=0",
|
||||
"-repeat=1"};
|
||||
|
||||
// Create an array of const char pointers for argv
|
||||
constexpr size_t ARG_COUNT = 10;
|
||||
constexpr size_t ARG_MAX_LEN = 64;
|
||||
char args[ARG_COUNT][ARG_MAX_LEN];
|
||||
char* argv[ARG_COUNT];
|
||||
|
||||
// Run all combinations
|
||||
bool is_success = true;
|
||||
for(const auto& m : m_values)
|
||||
{
|
||||
arg_strings[1] = "-m=" + m;
|
||||
|
||||
for(const auto& n : n_values)
|
||||
{
|
||||
arg_strings[2] = "-n=" + n;
|
||||
|
||||
for(const auto& k : k_values)
|
||||
{
|
||||
arg_strings[3] = "-k=" + k;
|
||||
|
||||
// Set up the argv array with pointers to the string data
|
||||
for(size_t i = 0; i < ARG_COUNT; i++)
|
||||
{
|
||||
strncpy(args[i], arg_strings[i].c_str(), ARG_MAX_LEN);
|
||||
argv[i] = args[i];
|
||||
}
|
||||
|
||||
std::cout << "Arguments received: ";
|
||||
for(size_t i = 1; i < ARG_COUNT; ++i)
|
||||
{
|
||||
std::cout << argv[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
// Call the function with the current configuration
|
||||
try
|
||||
{
|
||||
#if CK_TILE_USE_WMMA
|
||||
is_success = run_gemm_test<GemmConfigComputeV3_WMMA<CPrecType>,
|
||||
APrecType,
|
||||
BPrecType,
|
||||
CPrecType>(ARG_COUNT, argv) &&
|
||||
is_success;
|
||||
#else
|
||||
is_success = run_gemm_test<GemmConfigComputeV3<CPrecType>,
|
||||
APrecType,
|
||||
BPrecType,
|
||||
CPrecType>(ARG_COUNT, argv) &&
|
||||
is_success;
|
||||
is_success = run_gemm_test<GemmConfigComputeV3_2<CPrecType>,
|
||||
APrecType,
|
||||
BPrecType,
|
||||
CPrecType>(ARG_COUNT, argv) &&
|
||||
is_success;
|
||||
#endif
|
||||
}
|
||||
catch(const ArgumentsNotSupportedException& e)
|
||||
{
|
||||
std::cerr << "Caught ArgumentsNotSupportedException: " << e.what() << '\n';
|
||||
// ArgumentsNotSupportedException is not an error. Do not change is_success
|
||||
}
|
||||
catch(const std::runtime_error& e)
|
||||
{
|
||||
std::cerr << "Caught runtime error: " << e.what() << '\n';
|
||||
is_success = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return is_success;
|
||||
}
|
||||
@@ -120,9 +120,9 @@ TYPED_TEST(TEST_SUITE_NAME, PaddK)
|
||||
|
||||
TYPED_TEST(TEST_SUITE_NAME, Regular)
|
||||
{
|
||||
std::vector<int> Ms{512};
|
||||
constexpr int N = 1024;
|
||||
constexpr int K = 512;
|
||||
std::vector<int> Ms{128};
|
||||
constexpr int N = 128;
|
||||
constexpr int K = 128;
|
||||
|
||||
for(int M : Ms)
|
||||
this->Run(M, N, K);
|
||||
|
||||
@@ -174,6 +174,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
|
||||
Persistent,
|
||||
NumWaveGroup,
|
||||
preshuffle>;
|
||||
printf("[DEBUG] VectorSize_: %d\n", GemmUniversalTraits::_VectorSize);
|
||||
|
||||
using GemmPipelineProblem =
|
||||
ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
|
||||
@@ -224,6 +225,8 @@ class TestCkTileGemmPipeline : public ::testing::Test
|
||||
K_Warp_Tile,
|
||||
UniversalGemmProblem::TransposeC,
|
||||
memory_operation>>;
|
||||
//GemmEpilogue::Problem::PrintInfo();
|
||||
//GemmEpilogue::PrintInfo();
|
||||
|
||||
using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
|
||||
auto kargs = Kernel::MakeKernelArgs(args);
|
||||
@@ -256,20 +259,20 @@ class TestCkTileGemmPipeline : public ::testing::Test
|
||||
};
|
||||
|
||||
const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
|
||||
if(args.k_batch == 1)
|
||||
{
|
||||
//if(args.k_batch == 1)
|
||||
//{
|
||||
Run(has_hot_loop_,
|
||||
tail_number_,
|
||||
ck_tile::integral_constant<ck_tile::memory_operation_enum,
|
||||
ck_tile::memory_operation_enum::set>{});
|
||||
}
|
||||
else
|
||||
{
|
||||
Run(has_hot_loop_,
|
||||
tail_number_,
|
||||
ck_tile::integral_constant<ck_tile::memory_operation_enum,
|
||||
ck_tile::memory_operation_enum::atomic_add>{});
|
||||
}
|
||||
//}
|
||||
//else
|
||||
//{
|
||||
// Run(has_hot_loop_,
|
||||
// tail_number_,
|
||||
// ck_tile::integral_constant<ck_tile::memory_operation_enum,
|
||||
// ck_tile::memory_operation_enum::atomic_add>{});
|
||||
//}
|
||||
};
|
||||
|
||||
BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
|
||||
@@ -284,17 +287,17 @@ class TestCkTileGemmPipeline : public ::testing::Test
|
||||
{
|
||||
GTEST_SKIP() << "Unsupported data type combination for gemm pipeline test.";
|
||||
}
|
||||
if constexpr(PipelineType == GemmPipelineType::CompV4 ||
|
||||
std::is_same_v<BDataType, ck_tile::pk_int4_t>)
|
||||
{
|
||||
//if constexpr(PipelineType == GemmPipelineType::CompV4 ||
|
||||
// std::is_same_v<BDataType, ck_tile::pk_int4_t>)
|
||||
//{
|
||||
// Only do k_batch = 1 when pipeline is CompV4, or BDataType is I4
|
||||
k_batches_ = {1};
|
||||
}
|
||||
else
|
||||
{
|
||||
// Otherwise, use k_batch = 1 and 2
|
||||
k_batches_ = {1, 2};
|
||||
}
|
||||
k_batches_ = {1};
|
||||
//}
|
||||
//else
|
||||
//{
|
||||
// // Otherwise, use k_batch = 1 and 2
|
||||
// k_batches_ = {1, 2};
|
||||
//}
|
||||
}
|
||||
|
||||
template <bool PadM = true, bool PadN = true, bool PadK = true, bool Preshuffle = false>
|
||||
@@ -338,8 +341,15 @@ class TestCkTileGemmPipeline : public ::testing::Test
|
||||
ck_tile::HostTensor<CDataType> c_m_n_dev_result(
|
||||
ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
|
||||
|
||||
ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5, 5, 11939}(a_m_k);
|
||||
ck_tile::FillUniformDistributionIntegerValue<BDataType>{-5, 5, 11940}(b_k_n);
|
||||
std::cout << "a_m_k: ";
|
||||
a_m_k.print_first_n(std::cout) << '\n';
|
||||
std::cout << "b_k_n: ";
|
||||
b_k_n.print_first_n(std::cout) << '\n';
|
||||
std::cout << "c_m_n_dev_result: ";
|
||||
c_m_n_dev_result.print_first_n(std::cout) << '\n';
|
||||
|
||||
ck_tile::FillUniformDistributionIntegerValue<ADataType>{1, 2, 11939}(a_m_k);
|
||||
ck_tile::FillUniformDistributionIntegerValue<BDataType>{1, 2, 11940}(b_k_n);
|
||||
|
||||
ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
|
||||
@@ -372,7 +382,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
|
||||
stride_B,
|
||||
stride_C};
|
||||
|
||||
invoke_gemm<PadM, PadN, PadK, Preshuffle>(args, ck_tile::stream_config{nullptr, false});
|
||||
invoke_gemm<PadM, PadN, PadK, Preshuffle>(args, ck_tile::stream_config{nullptr, false, 2});
|
||||
|
||||
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
|
||||
bool pass = true;
|
||||
@@ -384,6 +394,13 @@ class TestCkTileGemmPipeline : public ::testing::Test
|
||||
ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
|
||||
a_m_k, b_k_n, c_m_n_host_ref);
|
||||
|
||||
std::cout << "a_m_k: ";
|
||||
a_m_k.print_first_n(std::cout) << '\n';
|
||||
std::cout << "b_k_n: ";
|
||||
b_k_n.print_first_n(std::cout) << '\n';
|
||||
std::cout << "c_m_n_dev_result: ";
|
||||
c_m_n_dev_result.print_first_n(std::cout) << '\n';
|
||||
|
||||
const float max_accumulated_value =
|
||||
*std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
|
||||
const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
|
||||
|
||||
Reference in New Issue
Block a user