mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-03 21:21:22 +00:00
Grouped GEMM for fp16 (#126)
* init of grouped_gemm * 2 gemm test * perf test * clean * wrap desc into a struct * test cast static_arr to pointer * add ptr to GemmDesc * add grouped gemm profiler * fixed mem issue with unique_ptr * clean * clean * finished ckprofiler * Update README.md * readme * fixed readme * add example * improve code * fixed comments: reserve, seperate ptr and gemm_shapes * merge group and non-group * fixed comments: replace push_back with emplace_back to avoid copy constructor * fixed comments: unified blk2ctile; add test * ci fix * fixed ci * fixed ci * fixed ci
This commit is contained in:
@@ -54,6 +54,80 @@ __global__ void
|
||||
block_2_ctile_map);
|
||||
}
|
||||
|
||||
template <typename GridwiseGemm,
|
||||
typename FloatAB,
|
||||
typename FloatC,
|
||||
typename GemmDesc,
|
||||
typename AElementwiseOperation,
|
||||
typename BElementwiseOperation,
|
||||
typename CElementwiseOperation,
|
||||
bool HasMainK0BlockLoop,
|
||||
index_t MaxGroupCount>
|
||||
__global__ void
|
||||
#if CK_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
#endif
|
||||
kernel_grouped_gemm_xdlops_v2r3(
|
||||
const StaticallyIndexedArray<GemmDesc, MaxGroupCount> gemm_desc_,
|
||||
const index_t group_count,
|
||||
const AElementwiseOperation a_element_op,
|
||||
const BElementwiseOperation b_element_op,
|
||||
const CElementwiseOperation c_element_op)
|
||||
{
|
||||
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
|
||||
|
||||
const index_t block_id = get_block_1d_id();
|
||||
|
||||
#if 1
|
||||
static_for<0, MaxGroupCount, 1>{}([&](auto i) {
|
||||
if(block_id >= gemm_desc_[i].BlockStart_ && block_id < gemm_desc_[i].BlockEnd_ &&
|
||||
i < group_count)
|
||||
{
|
||||
auto group_id = i;
|
||||
|
||||
GridwiseGemm::template Run<HasMainK0BlockLoop>(
|
||||
gemm_desc_[group_id].a_ptr,
|
||||
gemm_desc_[group_id].b_ptr,
|
||||
gemm_desc_[group_id].c_ptr,
|
||||
p_shared,
|
||||
gemm_desc_[group_id].a_grid_desc_k0_m_k1_,
|
||||
gemm_desc_[group_id].b_grid_desc_k0_n_k1_,
|
||||
gemm_desc_[group_id].c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
c_element_op,
|
||||
gemm_desc_[group_id].grouped_gemm_block_2_ctile_map_);
|
||||
}
|
||||
});
|
||||
#else
|
||||
const auto gemm_desc_ptr = reinterpret_cast<const GemmDesc*>(&gemm_desc_);
|
||||
|
||||
index_t group_id = 0;
|
||||
static_for<0, MaxGroupCount, 1>{}([&](auto i) {
|
||||
group_id = (block_id >= gemm_desc_[i].BlockStart && block_id < gemm_desc_[i].BlockEnd &&
|
||||
i < group_count)
|
||||
? i
|
||||
: group_id;
|
||||
});
|
||||
|
||||
const index_t block_id_grp = block_id - gemm_desc_ptr[group_id].BlockStart;
|
||||
|
||||
GridwiseGemm::template Run<HasMainK0BlockLoop>(
|
||||
gemm_desc_ptr[group_id].a_ptr,
|
||||
gemm_desc_ptr[group_id].b_ptr,
|
||||
gemm_desc_ptr[group_id].c_ptr,
|
||||
p_shared,
|
||||
gemm_desc_ptr[group_id].a_grid_desc_k0_m_k1_,
|
||||
gemm_desc_ptr[group_id].b_grid_desc_k0_n_k1_,
|
||||
gemm_desc_ptr[group_id].c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
c_element_op,
|
||||
gemm_desc_ptr[group_id].block_2_ctile_map_,
|
||||
block_id_grp);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <index_t BlockSize,
|
||||
typename FloatAB,
|
||||
typename FloatAcc,
|
||||
|
||||
Reference in New Issue
Block a user