Extend XDL kernel to Support RDNA3/4 - Part 4 (#2724)

* Fix example * fix build error * update pk_i4 & moe test case * fix all instance build (examples) * fix batched_gemm_gemm (example) * disable example_gemm_bias_softmax_gemm_permute on gfx11 * remove unnecessary disable gfx11 * update tests * update tests2
2026-05-04 05:31:24 +00:00 · 2025-09-12 23:17:07 +08:00
parent bca99a499d
commit 321627aec5
123 changed files with 848 additions and 574 deletions
--- a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -68,10 +68,10 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
     64,                         // KPerBlock,
     16,                         // AK1,
     16,                         // BK1,
-     32,                         // MPerXDL,
-     32,                         // NPerXDL,
-     4,                          // MXdlPerWave,
-     2,                          // NXdlPerWave,
+     16,                         // MPerXDL,
+     16,                         // NPerXDL,
+     8,                          // MXdlPerWave,
+     4,                          // NXdlPerWave,
     S<4, 64, 1>,                // ABlockTransferThreadClusterLengths_AK0_M_AK1,
     S<1, 0, 2>,                 // ABlockTransferThreadClusterArrangeOrder,
     S<1, 0, 2>,                 // ABlockTransferSrcAccessOrder,
@@ -88,8 +88,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl
     1,                          // bool BBlockLdsExtraN,
     1,                          // index_t CShuffleMXdlPerWavePerShuffle,
     1,                          // index_t CShuffleNXdlPerWavePerShuffle,
-     S<1, 64, 1, 4>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-     16>;                        // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
+     S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+     4>;                        // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
 // clang-format on

 using ReferenceGemmInstance = ck::tensor_operation::host::