From 01bd52bdb5fdf55f4047cce5e4abc1ebbf302f5f Mon Sep 17 00:00:00 2001 From: Yi DING <28386673+DDEle@users.noreply.github.com> Date: Wed, 3 Jun 2026 02:09:05 +0000 Subject: [PATCH] [rocm-libraries] ROCm/rocm-libraries#7925 (commit a8f0845) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [CK] Fix gfx950 AITER Sync Regressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Fixes three gfx950 regressions in the AITER downstream CI that surfaced after the internal/gfx1250 re-sync (ROCm/rocm-libraries#6978): > **Companion aiter PR:** ROCm/aiter#3392 — host-side adaptations (`Kernel::BlockSize()` `constexpr` drops, blockscale `KBatch=1` clamp) plus the CK submodule bump used to validate these fixes together. - **FlyDSL MoE AOT cache miss** — the AITER MoE tests run with `check_aot_cache=True` and fail on any FlyDSL JIT cache miss, but the CI never pre-compiles the FlyDSL MoE kernels, so gfx950 always misses. Pre-compile them at the start of the AITER test stage. - **`buffer.load.lds.v4i32` link error** — ROCm/rocm-libraries#6978 reintroduced a clang-version guard mapping `llvm.amdgcn.raw.buffer.load.lds` to a `.v4i32`-suffixed name. That name exists in no LLVM (the rsrc operand is a fixed, non-overloaded `<4 x i32>`, so the intrinsic is never type-mangled), so gfx950 4-DWORD direct-to-LDS (e.g. fp4 MoE bpreshuffle) fails to link with `lld: undefined symbol: llvm.amdgcn.raw.buffer.load.lds.v4i32`. Use the canonical plain name unconditionally. - **mixed-precision flatmm warp-GEMM call** — ROCm/rocm-libraries#6978 generalized the scaled `WarpGemmImpl::operator()` from a fixed `` signature to a variadic `` one and updated the `mx_flatmm` pipeline to pass the op-selectors as `OpSelA<>`/`OpSelB<>` types, but missed the mixed-precision flatmm pipeline (`F8xMXF4`/`F16xMXF4`), which still passed raw integer op-selectors. These no longer bind to `typename... Params` (`error: no matching member function for call to 'operator()'`), breaking compilation of the fp8/bf16 × fp4 cktile MoE gemm1 instances on gfx950 (aiter `test_moe_2stage`). Wrap the op-selectors in `OpSelA<>`/`OpSelB<>`. ## Changes - `Jenkinsfile`: pre-compile the FlyDSL MoE AOT cache (`python3 aiter/aot/flydsl/moe.py`) before the AITER tests. - `include/ck/utility/amd_buffer_addressing_builtins.hpp` and `include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp`: drop the `__clang_major__` guard and always use `__asm("llvm.amdgcn.raw.buffer.load.lds")`. The plain name is the canonical one for all sizes including the gfx950 16-byte form, as the upstream LLVM gfx950 tests confirm. - `include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp`: wrap the warp-GEMM op-selectors in `OpSelA<>`/`OpSelB<>` at the five call sites, matching the `mx_flatmm` pipeline. ## Test plan Validated via CI. --- .../ck/utility/amd_buffer_addressing_builtins.hpp | 11 ----------- .../core/arch/amd_buffer_addressing_builtins.hpp | 11 ----------- ...ixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp | 12 +++++++----- vars/ck.groovy | 2 ++ 4 files changed, 9 insertions(+), 27 deletions(-) diff --git a/include/ck/utility/amd_buffer_addressing_builtins.hpp b/include/ck/utility/amd_buffer_addressing_builtins.hpp index e35986177a..a8e2cc4881 100644 --- a/include/ck/utility/amd_buffer_addressing_builtins.hpp +++ b/include/ck/utility/amd_buffer_addressing_builtins.hpp @@ -830,16 +830,6 @@ amd_buffer_atomic_max(const typename vector_type_maker::type::type src_thr } // Direct loads from global to LDS. -#if __clang_major__ >= 21 && __clang_major__ < 23 -__device__ void -llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, - __attribute__((address_space(3))) uint32_t* lds_ptr, - index_t size, - index_t voffset, - index_t soffset, - index_t offset, - index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds.v4i32"); -#else __device__ void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, __attribute__((address_space(3))) uint32_t* lds_ptr, @@ -848,7 +838,6 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); -#endif #ifndef __HIPCC_RTC__ template diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 55597b3723..448394dd43 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -1381,16 +1381,6 @@ CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64( int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32"); // Direct loads from global to LDS. -#if __clang_major__ >= 21 && __clang_major__ < 23 -CK_TILE_DEVICE_EXTERN void -llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, - as3_uint32_ptr lds_ptr, - index_t size, - index_t voffset, - index_t soffset, - index_t offset, - index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds.v4i32"); -#else CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, as3_uint32_ptr lds_ptr, @@ -1399,7 +1389,6 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); -#endif template CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void* smem, diff --git a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp index eb1df36ea6..f75cd25bd6 100644 --- a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -1982,7 +1982,7 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1 // warp GEMM WG{}.template // operator()( - operator()( + operator(), OpSelB>( c_warp_tensor, a_warp_tensor(number{}), b_warp_tensor_ping(nIter_pack * number{} + @@ -2092,7 +2092,7 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1 // warp GEMM WG{}.template // operator()( - operator()( + operator(), OpSelB>( c_warp_tensor, a_warp_tensor(number{}), b_warp_tensor_pong(nIter_pack * number{} + @@ -2214,7 +2214,8 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1 merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); // warp GEMM - WG{}.template operator()( + WG{}.template + operator(), OpSelB>( c_warp_tensor, a_warp_tensor(number{}), b_warp_tensor_ping(nIter_pack * number{} + @@ -2283,7 +2284,8 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1 merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); // warp GEMM - WG{}.template operator()( + WG{}.template + operator(), OpSelB>( // operator()( c_warp_tensor, a_warp_tensor(number{}), @@ -2346,7 +2348,7 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1 // warp GEMM WG{}.template // operator()( - operator()( + operator(), OpSelB>( c_warp_tensor, a_warp_tensor(number{}), b_warp_tensor_ping(nIter_pack * number{} + diff --git a/vars/ck.groovy b/vars/ck.groovy index c71564d898..c97cfd81e3 100644 --- a/vars/ck.groovy +++ b/vars/ck.groovy @@ -1180,6 +1180,8 @@ def getPytorchTestsCmds() { } def getAiterTestsCmds() { return [ + // Pre-compile FlyDSL MoE AOT cache before the tests. + "cd /home/jenkins/workspace/aiter && python3 aiter/aot/flydsl/moe.py", "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py", "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py", "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py",