Implement grouped gemm tile loop for RDNA4 (#3304)

* feat: grouped gemm tile loop support for RDNA4

* fix: removed extra parameter from grouped gemm example instance

* fix: FP8 check incorrectly enabling FP8 on RDNA3
This commit is contained in:
Erwin Terpstra
2026-01-13 07:14:23 +01:00
committed by GitHub
parent 141f77aa12
commit eb041079a3
44 changed files with 3067 additions and 1223 deletions

View File

@@ -151,7 +151,10 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = 1;
static bool BlockHasHotloop(index_t num_loop) { return num_loop > PrefetchStages; }
static bool __host__ __device__ BlockHasHotloop(index_t num_loop)
{
return num_loop > PrefetchStages;
}
static TailNumber BlockLoopTailNum(index_t num_loop)
{
@@ -707,7 +710,10 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
static constexpr index_t PrefillStages = 1;
static constexpr index_t GlobalBufferNum = 1;
static bool BlockHasHotloop(index_t num_loop) { return num_loop > PrefetchStages; }
// Host/device predicate: true only if PrefetchStages is strictly less than
// the given loop count `num_loop`.
// NOTE(review): the name suggests this gates the pipeline's main loop -
// verify against the callers.
__host__ __device__ static bool BlockHasHotloop(index_t num_loop)
{
    return PrefetchStages < num_loop;
}
static TailNumber BlockLoopTailNum(index_t num_loop)
{