[CK TILE] Implement cschuflle algorithm (#1842)

* [CK TILE] Implement cschuflle algorithm

* Rebase

* Vector store size fixes

* fixes

* Fixes

* fixes

* fmha fix

* fixes

* fixes of fixes
This commit is contained in:
Bartłomiej Kocot
2025-01-30 11:57:39 +01:00
committed by GitHub
parent c5fff071e5
commit 25e2e0f04a
18 changed files with 408 additions and 371 deletions

View File

@@ -159,12 +159,8 @@ struct GemmKernel
CK_TILE_HOST static bool IsSupportedArgument(const GemmKernelArgs& kargs)
{
constexpr bool is_output_c_reg_transposed =
EpiloguePipeline::IsOutputTransposed() != GemmPipeline::IsTransposeC();
if constexpr(!((GemmPipeline::VectorSizeC % 2 == 0 &&
std::is_same_v<CLayout, tensor_layout::gemm::RowMajor> &&
is_output_c_reg_transposed) ||
!(std::is_same_v<CDataType, fp16_t> || std::is_same_v<CDataType, bf16_t>)))
if constexpr(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
is_any_of<CDataType, fp16_t, bf16_t>::value)
{
if(kargs.KBatch != 1)
{
@@ -182,7 +178,7 @@ struct GemmKernel
<< std::endl;
return false;
}
if(kargs.K % GemmPipeline::VectorSizeA != 0)
if(kargs.K % GemmPipeline::GetVectorSizeA() != 0)
{
std::cerr << "K is not a multiple of vector load size for A tensor!" << std::endl;
return false;
@@ -197,7 +193,7 @@ struct GemmKernel
<< std::endl;
return false;
}
if(kargs.M % GemmPipeline::VectorSizeA != 0)
if(kargs.M % GemmPipeline::GetVectorSizeA() != 0)
{
std::cerr << "M is not a multiple of vector load size for A tensor!" << std::endl;
return false;
@@ -213,7 +209,7 @@ struct GemmKernel
<< std::endl;
return false;
}
if(kargs.N % GemmPipeline::VectorSizeB != 0)
if(kargs.N % GemmPipeline::GetVectorSizeB() != 0)
{
std::cerr << "N is not a multiple of vector load size for B tensor!" << std::endl;
return false;
@@ -228,7 +224,7 @@ struct GemmKernel
<< std::endl;
return false;
}
if(kargs.K % GemmPipeline::VectorSizeB != 0)
if(kargs.K % GemmPipeline::GetVectorSizeB() != 0)
{
std::cerr << "K is not a multiple of vector load size for B tensor!" << std::endl;
return false;
@@ -244,7 +240,7 @@ struct GemmKernel
<< std::endl;
return false;
}
if(kargs.N % GemmPipeline::VectorSizeC != 0)
if(kargs.N % EpiloguePipeline::GetVectorSizeC() != 0)
{
std::cerr << "N is not a multiple of vector load size for C tensor!" << std::endl;
return false;
@@ -259,7 +255,7 @@ struct GemmKernel
<< std::endl;
return false;
}
if(kargs.M % GemmPipeline::VectorSizeC != 0)
if(kargs.M % EpiloguePipeline::GetVectorSizeC() != 0)
{
std::cerr << "M is not a multiple of vector load size for C tensor!" << std::endl;
return false;
@@ -275,14 +271,6 @@ struct GemmKernel
const GemmKernelArgs& kargs,
const SplitKBatchOffset& splitk_batch_offset)
{
// const auto idxs = TilePartitioner{}();
// const auto i_m = idxs.at(number<0>{});
// const auto i_n = idxs.at(number<1>{});
// // options
// const ADataType* a_start = static_cast<const ADataType*>(kargs.a_ptr);
// const BDataType* b_start = static_cast<const BDataType*>(kargs.b_ptr);
// // Convert pointers to tensor views
// auto a_tensor_view = [&]() {
const auto& a_tensor_view = [&]() {
if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
{
@@ -290,7 +278,7 @@ struct GemmKernel
a_ptr,
make_tuple(kargs.M, splitk_batch_offset.splitted_k),
make_tuple(kargs.stride_A, 1),
number<GemmPipeline::VectorSizeA>{},
number<GemmPipeline::GetVectorSizeA()>{},
number<1>{});
}
else
@@ -299,7 +287,7 @@ struct GemmKernel
a_ptr,
make_tuple(splitk_batch_offset.splitted_k, kargs.M),
make_tuple(kargs.stride_A, 1),
number<GemmPipeline::VectorSizeA>{},
number<GemmPipeline::GetVectorSizeA()>{},
number<1>{});
}
}();
@@ -311,7 +299,7 @@ struct GemmKernel
b_ptr,
make_tuple(splitk_batch_offset.splitted_k, kargs.N),
make_tuple(kargs.stride_B, 1),
number<GemmPipeline::VectorSizeB>{},
number<GemmPipeline::GetVectorSizeB()>{},
number<1>{});
}
else
@@ -320,7 +308,7 @@ struct GemmKernel
b_ptr,
make_tuple(kargs.N, splitk_batch_offset.splitted_k),
make_tuple(kargs.stride_B, 1),
number<GemmPipeline::VectorSizeB>{},
number<GemmPipeline::GetVectorSizeB()>{},
number<1>{});
}
}();
@@ -333,7 +321,7 @@ struct GemmKernel
c_ptr,
make_tuple(kargs.M, kargs.N),
make_tuple(kargs.stride_C, 1),
number<GemmPipeline::VectorSizeC>{},
number<EpiloguePipeline::GetVectorSizeC()>{},
number<1>{});
}
else
@@ -501,16 +489,13 @@ struct GemmKernel
// Run Epilogue Pipeline
auto& c_block_window = gemm_tile_windows.at(I2);
constexpr bool is_output_c_reg_transposed =
EpiloguePipeline::IsOutputTransposed() != GemmPipeline::IsTransposeC();
if constexpr((DstInMemOp == memory_operation_enum::set) || (sizeof(CDataType) > 2) ||
(GemmPipeline::VectorSizeC % 2 == 0 &&
std::is_same_v<CLayout, tensor_layout::gemm::RowMajor> &&
is_output_c_reg_transposed))
if constexpr(DstInMemOp == memory_operation_enum::set ||
!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
is_any_of<CDataType, fp16_t, bf16_t>::value))
{
EpiloguePipeline{}
.template operator()<decltype(c_block_window), decltype(c_block_tile), DstInMemOp>(
c_block_window, c_block_tile);
c_block_window, c_block_tile, smem_ptr);
}
}