mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-01 20:21:23 +00:00
[CK TILE] Implement cschuflle algorithm (#1842)
* [CK TILE] Implement cschuflle algorithm * Rebase * Vector store size fixes * fixes * Fixes * fixes * fmha fix * fixes * fixes of fixes
This commit is contained in:
@@ -159,12 +159,8 @@ struct GemmKernel
|
||||
|
||||
CK_TILE_HOST static bool IsSupportedArgument(const GemmKernelArgs& kargs)
|
||||
{
|
||||
constexpr bool is_output_c_reg_transposed =
|
||||
EpiloguePipeline::IsOutputTransposed() != GemmPipeline::IsTransposeC();
|
||||
if constexpr(!((GemmPipeline::VectorSizeC % 2 == 0 &&
|
||||
std::is_same_v<CLayout, tensor_layout::gemm::RowMajor> &&
|
||||
is_output_c_reg_transposed) ||
|
||||
!(std::is_same_v<CDataType, fp16_t> || std::is_same_v<CDataType, bf16_t>)))
|
||||
if constexpr(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
|
||||
is_any_of<CDataType, fp16_t, bf16_t>::value)
|
||||
{
|
||||
if(kargs.KBatch != 1)
|
||||
{
|
||||
@@ -182,7 +178,7 @@ struct GemmKernel
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
if(kargs.K % GemmPipeline::VectorSizeA != 0)
|
||||
if(kargs.K % GemmPipeline::GetVectorSizeA() != 0)
|
||||
{
|
||||
std::cerr << "K is not a multiple of vector load size for A tensor!" << std::endl;
|
||||
return false;
|
||||
@@ -197,7 +193,7 @@ struct GemmKernel
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
if(kargs.M % GemmPipeline::VectorSizeA != 0)
|
||||
if(kargs.M % GemmPipeline::GetVectorSizeA() != 0)
|
||||
{
|
||||
std::cerr << "M is not a multiple of vector load size for A tensor!" << std::endl;
|
||||
return false;
|
||||
@@ -213,7 +209,7 @@ struct GemmKernel
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
if(kargs.N % GemmPipeline::VectorSizeB != 0)
|
||||
if(kargs.N % GemmPipeline::GetVectorSizeB() != 0)
|
||||
{
|
||||
std::cerr << "N is not a multiple of vector load size for B tensor!" << std::endl;
|
||||
return false;
|
||||
@@ -228,7 +224,7 @@ struct GemmKernel
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
if(kargs.K % GemmPipeline::VectorSizeB != 0)
|
||||
if(kargs.K % GemmPipeline::GetVectorSizeB() != 0)
|
||||
{
|
||||
std::cerr << "K is not a multiple of vector load size for B tensor!" << std::endl;
|
||||
return false;
|
||||
@@ -244,7 +240,7 @@ struct GemmKernel
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
if(kargs.N % GemmPipeline::VectorSizeC != 0)
|
||||
if(kargs.N % EpiloguePipeline::GetVectorSizeC() != 0)
|
||||
{
|
||||
std::cerr << "N is not a multiple of vector load size for C tensor!" << std::endl;
|
||||
return false;
|
||||
@@ -259,7 +255,7 @@ struct GemmKernel
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
if(kargs.M % GemmPipeline::VectorSizeC != 0)
|
||||
if(kargs.M % EpiloguePipeline::GetVectorSizeC() != 0)
|
||||
{
|
||||
std::cerr << "M is not a multiple of vector load size for C tensor!" << std::endl;
|
||||
return false;
|
||||
@@ -275,14 +271,6 @@ struct GemmKernel
|
||||
const GemmKernelArgs& kargs,
|
||||
const SplitKBatchOffset& splitk_batch_offset)
|
||||
{
|
||||
// const auto idxs = TilePartitioner{}();
|
||||
// const auto i_m = idxs.at(number<0>{});
|
||||
// const auto i_n = idxs.at(number<1>{});
|
||||
// // options
|
||||
// const ADataType* a_start = static_cast<const ADataType*>(kargs.a_ptr);
|
||||
// const BDataType* b_start = static_cast<const BDataType*>(kargs.b_ptr);
|
||||
// // Convert pointers to tensor views
|
||||
// auto a_tensor_view = [&]() {
|
||||
const auto& a_tensor_view = [&]() {
|
||||
if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
@@ -290,7 +278,7 @@ struct GemmKernel
|
||||
a_ptr,
|
||||
make_tuple(kargs.M, splitk_batch_offset.splitted_k),
|
||||
make_tuple(kargs.stride_A, 1),
|
||||
number<GemmPipeline::VectorSizeA>{},
|
||||
number<GemmPipeline::GetVectorSizeA()>{},
|
||||
number<1>{});
|
||||
}
|
||||
else
|
||||
@@ -299,7 +287,7 @@ struct GemmKernel
|
||||
a_ptr,
|
||||
make_tuple(splitk_batch_offset.splitted_k, kargs.M),
|
||||
make_tuple(kargs.stride_A, 1),
|
||||
number<GemmPipeline::VectorSizeA>{},
|
||||
number<GemmPipeline::GetVectorSizeA()>{},
|
||||
number<1>{});
|
||||
}
|
||||
}();
|
||||
@@ -311,7 +299,7 @@ struct GemmKernel
|
||||
b_ptr,
|
||||
make_tuple(splitk_batch_offset.splitted_k, kargs.N),
|
||||
make_tuple(kargs.stride_B, 1),
|
||||
number<GemmPipeline::VectorSizeB>{},
|
||||
number<GemmPipeline::GetVectorSizeB()>{},
|
||||
number<1>{});
|
||||
}
|
||||
else
|
||||
@@ -320,7 +308,7 @@ struct GemmKernel
|
||||
b_ptr,
|
||||
make_tuple(kargs.N, splitk_batch_offset.splitted_k),
|
||||
make_tuple(kargs.stride_B, 1),
|
||||
number<GemmPipeline::VectorSizeB>{},
|
||||
number<GemmPipeline::GetVectorSizeB()>{},
|
||||
number<1>{});
|
||||
}
|
||||
}();
|
||||
@@ -333,7 +321,7 @@ struct GemmKernel
|
||||
c_ptr,
|
||||
make_tuple(kargs.M, kargs.N),
|
||||
make_tuple(kargs.stride_C, 1),
|
||||
number<GemmPipeline::VectorSizeC>{},
|
||||
number<EpiloguePipeline::GetVectorSizeC()>{},
|
||||
number<1>{});
|
||||
}
|
||||
else
|
||||
@@ -501,16 +489,13 @@ struct GemmKernel
|
||||
// Run Epilogue Pipeline
|
||||
auto& c_block_window = gemm_tile_windows.at(I2);
|
||||
|
||||
constexpr bool is_output_c_reg_transposed =
|
||||
EpiloguePipeline::IsOutputTransposed() != GemmPipeline::IsTransposeC();
|
||||
if constexpr((DstInMemOp == memory_operation_enum::set) || (sizeof(CDataType) > 2) ||
|
||||
(GemmPipeline::VectorSizeC % 2 == 0 &&
|
||||
std::is_same_v<CLayout, tensor_layout::gemm::RowMajor> &&
|
||||
is_output_c_reg_transposed))
|
||||
if constexpr(DstInMemOp == memory_operation_enum::set ||
|
||||
!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
|
||||
is_any_of<CDataType, fp16_t, bf16_t>::value))
|
||||
{
|
||||
EpiloguePipeline{}
|
||||
.template operator()<decltype(c_block_window), decltype(c_block_tile), DstInMemOp>(
|
||||
c_block_window, c_block_tile);
|
||||
c_block_window, c_block_tile, smem_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user