Add other layouts for FP8 block scaled gemm (#2665)

* Start adding other layouts for gemm_ab_scale

* Add some instances

* Create tensor descriptors for A/B scales depending on A/B layout

* Fix formatting

* Revert some comments

* Revert commented instances in CMakeLists.txt

* Add some more instances for col-row gemm

* Enable more row,row instances

* Use occupancy=1 for col,row layout to avoid spills
This commit is contained in:
Sami Remes
2025-08-18 11:46:10 +03:00
committed by GitHub
parent 7310830d14
commit 26d3300930
15 changed files with 758 additions and 13 deletions

View File

@@ -231,11 +231,22 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3
}
};
constexpr index_t minimum_occupancy =
(BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave &&
MPerBlock * NPerBlock / BlockSize > 64)
? 1
: 2;
// Compile-time selection of minimum_occupancy through an immediately-invoked
// lambda, so that the A/B layout check can be expressed with `if constexpr`.
constexpr index_t minimum_occupancy = [&]() {
// Special case: A is column-major and B is row-major.
if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout> &&
is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
{
// FIXME: with occupancy > 1, many instances of this layout combination suffer
// from a large number of spills, so occupancy is pinned to 1 here. A better
// solution is needed to get the best performance.
return 1;
}
else
{
// All other layout combinations: use 1 when the block GEMM pipeline scheduler
// is Intrawave and MPerBlock * NPerBlock / BlockSize exceeds 64; otherwise 2.
return (BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave &&
MPerBlock * NPerBlock / BlockSize > 64)
? 1
: 2;
}
}();
if(has_main_k_block_loop)
{