Add v_permlane32_swap (via __builtin_amdgcn_permlane32_swap) for block_reduce. Disable it, because it makes packed math in FA non-co-executable

2026-05-11 17:00:18 +00:00 · 2025-08-04 10:27:42 +00:00
parent 4f31847de1
commit 0d12fc944f
5 changed files with 101 additions and 52 deletions
--- a/include/ck_tile/core/arch/arch.hpp
+++ b/include/ck_tile/core/arch/arch.hpp
@@ -95,7 +95,15 @@ CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }
 template <index_t lgkmcnt = 0>
 CK_TILE_DEVICE void block_sync_lds()
 {
-    __builtin_amdgcn_s_waitcnt(CK_TILE_S_CNT_MAX & CK_TILE_LGKMCNT(lgkmcnt));
+    if constexpr(lgkmcnt > 15)
+    {
+        __builtin_amdgcn_s_waitcnt(CK_TILE_S_CNT_MAX & CK_TILE_LGKMCNT(15));
+    }
+    else
+    {
+        __builtin_amdgcn_s_waitcnt(CK_TILE_S_CNT_MAX & CK_TILE_LGKMCNT(lgkmcnt));
+    }
+
    __builtin_amdgcn_s_barrier();
 }

--- a/include/ck_tile/core/arch/utility.hpp
+++ b/include/ck_tile/core/arch/utility.hpp
@@ -59,6 +59,21 @@ CK_TILE_DEVICE T warp_shuffle_down(const T& v_local, uint32_t lane_delta)
 #endif
 }

+template <typename T>
+CK_TILE_DEVICE auto warp_shuffle_down_pair(const T& v_local)
+{
+    static_assert(sizeof(T) == sizeof(int32_t), "wrong!");
+
+    const int32x2_t x = __builtin_amdgcn_permlane32_swap(
+        bit_cast<int32_t>(v_local), bit_cast<int32_t>(v_local), false, false);
+
+    thread_buffer<T, 2> v;
+    v(0) = bit_cast<T>(x[0]);
+    v(1) = bit_cast<T>(x[1]);
+
+    return v;
+}
+
 template <typename T>
 CK_TILE_DEVICE T warp_shuffle(const T& v_local, uint32_t src_lane)
 {