Merge commit 'b03764ca5a917752845ddbb5da8886051a16d9be' into develop

This commit is contained in:
assistant-librarian[bot]
2025-10-17 17:11:18 +00:00
parent 99ccb97fad
commit f2f7a548cb
15 changed files with 172 additions and 80 deletions

View File

@@ -11,4 +11,33 @@ enum StreamKReductionStrategy : uint32_t
Atomic = 0u,
Reduction = 1u
};
/**
 * @brief Estimates the number of Stream-K workgroups per macro tile in the C tensor.
 *
 * @tparam ReductionStrategy Reduction strategy in use. Only
 * `StreamKReductionStrategy::Atomic` can produce an estimate greater than 1.
 * @param sk_ctas Number of Stream-K workgroups.
 * @param iters_per_sk_cta Number of iterations per Stream-K workgroup.
 * @param iters_per_tile Number of iterations per tile (i.e., the number of macro tiles in the K
 * dimension).
 * @return ck_tile::index_t An estimate of the number of workgroups per macro tile in the C tensor.
 * The result is never less than 1.
 * @note `iters_per_sk_cta` is expected to be > 0. A value of 0 is tolerated: the estimate falls
 * back to 1 instead of performing an integer division by zero.
 */
template <ck_tile::StreamKReductionStrategy ReductionStrategy>
ck_tile::index_t
estimate_num_wgs_per_tile(index_t sk_ctas, index_t iters_per_sk_cta, index_t iters_per_tile)
{
    // In the case of non-atomic reduction or data-parallel only, there will always be 1 workgroup
    // writing final results to a given macro tile in C.
    index_t num_wgs_per_tile = 1;

    // Otherwise, for atomics, multiple workgroups may be writing to the same macro tile in C.
    const bool uses_atomics = ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic;

    // The divisor check keeps the `/` and `%` below well-defined even if a caller violates the
    // documented `iters_per_sk_cta > 0` expectation.
    if(uses_atomics && sk_ctas > 0 && iters_per_sk_cta != 0)
    {
        // Estimate the number of workgroups per macro tile: the quotient, plus one more
        // workgroup when the division leaves a remainder.
        const index_t whole_wgs   = iters_per_tile / iters_per_sk_cta;
        const index_t partial_wgs = (iters_per_tile % iters_per_sk_cta) != 0 ? 1 : 0;
        num_wgs_per_tile          = whole_wgs + partial_wgs;
    }

    // Clamp so that a zero (or negative) quotient still reports at least one workgroup.
    return std::max<index_t>(num_wgs_per_tile, 1);
}
} // namespace ck_tile

View File

@@ -33,9 +33,10 @@
#include "ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp"
#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp"
#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp"
#include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp"