diff --git a/include/ck/utility/amd_buffer_addressing_builtins.hpp b/include/ck/utility/amd_buffer_addressing_builtins.hpp index e35986177a..a8e2cc4881 100644 --- a/include/ck/utility/amd_buffer_addressing_builtins.hpp +++ b/include/ck/utility/amd_buffer_addressing_builtins.hpp @@ -830,16 +830,6 @@ amd_buffer_atomic_max(const typename vector_type_maker::type::type src_thr } // Direct loads from global to LDS. -#if __clang_major__ >= 21 && __clang_major__ < 23 -__device__ void -llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, - __attribute__((address_space(3))) uint32_t* lds_ptr, - index_t size, - index_t voffset, - index_t soffset, - index_t offset, - index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds.v4i32"); -#else __device__ void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, __attribute__((address_space(3))) uint32_t* lds_ptr, @@ -848,7 +838,6 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); -#endif #ifndef __HIPCC_RTC__ template diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 55597b3723..448394dd43 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -1381,16 +1381,6 @@ CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64( int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32"); // Direct loads from global to LDS. -#if __clang_major__ >= 21 && __clang_major__ < 23 -CK_TILE_DEVICE_EXTERN void -llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, - as3_uint32_ptr lds_ptr, - index_t size, - index_t voffset, - index_t soffset, - index_t offset, - index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds.v4i32"); -#else CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, as3_uint32_ptr lds_ptr, @@ -1399,7 +1389,6 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); -#endif template CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void* smem, diff --git a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp index eb1df36ea6..f75cd25bd6 100644 --- a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -1982,7 +1982,7 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1 // warp GEMM WG{}.template // operator()( - operator()( + operator(), OpSelB>( c_warp_tensor, a_warp_tensor(number{}), b_warp_tensor_ping(nIter_pack * number{} + @@ -2092,7 +2092,7 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1 // warp GEMM WG{}.template // operator()( - operator()( + operator(), OpSelB>( c_warp_tensor, a_warp_tensor(number{}), b_warp_tensor_pong(nIter_pack * number{} + @@ -2214,7 +2214,8 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1 merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); // warp GEMM - WG{}.template operator()( + WG{}.template + operator(), OpSelB>( c_warp_tensor, a_warp_tensor(number{}), b_warp_tensor_ping(nIter_pack * number{} + @@ -2283,7 +2284,8 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1 merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); // warp GEMM - WG{}.template operator()( + WG{}.template + operator(), OpSelB>( // operator()( c_warp_tensor, a_warp_tensor(number{}), @@ -2346,7 +2348,7 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1 // warp GEMM WG{}.template // operator()( - operator()( + operator(), OpSelB>( c_warp_tensor, a_warp_tensor(number{}), b_warp_tensor_ping(nIter_pack * number{} + diff --git a/vars/ck.groovy b/vars/ck.groovy index c71564d898..c97cfd81e3 100644 --- a/vars/ck.groovy +++ b/vars/ck.groovy @@ -1180,6 +1180,8 @@ def getPytorchTestsCmds() { } def getAiterTestsCmds() { return [ + // Pre-compile FlyDSL MoE AOT cache before the tests. + "cd /home/jenkins/workspace/aiter && python3 aiter/aot/flydsl/moe.py", "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py", "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py", "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py",