Batch applied

2026-06-30 19:57:40 +00:00 · 2025-09-11 20:17:09 +00:00
parent 2ed39f8d91
commit 6be0c1d7a6
2 changed files with 160 additions and 298 deletions
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -13,7 +13,6 @@
 #include "ck_tile/core/utility/type_traits.hpp"
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
-#include "ck_tile/core/utility/ignore.hpp"

 // This attribute gives a hint to the compiler that a branch is likely to be taken.
 // Then, the compiler should remove if possible the associated s_cbranch_execz branch that would
@@ -24,8 +23,6 @@
 #define LIKELY(x) (__builtin_expect(!!(x), 1))
 #endif

-using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*;
-
 namespace ck_tile {

 // 128 bit SGPRs to supply buffer resource in buffer instructions
@@ -41,6 +38,10 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz
 {
    buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD};
    int32x4_t r = __builtin_bit_cast(int32x4_t, res);
+    r.x         = __builtin_amdgcn_readfirstlane(r.x);
+    r.y         = __builtin_amdgcn_readfirstlane(r.y);
+    r.z         = __builtin_amdgcn_readfirstlane(r.z);
+    r.w         = __builtin_amdgcn_readfirstlane(r.w);
    return r;
 }

@@ -298,12 +299,12 @@ struct buffer_load_if<16, pre_nop>
                                   index_t v_offset,
                                   index_t /*s_offset*/,
                                   index_t i_offset /*max 0xFFF*/,
-                                   index_t flag           = 0,
+                                   index_t flag = 0,
                                   bool_constant<pre_nop> = {})
    {
        static_assert(sizeof(T) == 16);
        auto saved_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t    = typename impl::buffer_load_trait<16, T>::payload_t;
+        using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
        static_assert(sizeof(mbuf_t) == sizeof(T));
        if constexpr(pre_nop)
            asm volatile("s_nop 4\n"
@@ -332,12 +333,12 @@ struct buffer_load_if<8, pre_nop>
                                   index_t v_offset,
                                   index_t /*s_offset*/,
                                   index_t i_offset /*max 0xFFF*/,
-                                   index_t flag           = 0,
+                                   index_t flag = 0,
                                   bool_constant<pre_nop> = {})
    {
        static_assert(sizeof(T) == 8);
        auto saved_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t    = typename impl::buffer_load_trait<8, T>::payload_t;
+        using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
        if constexpr(pre_nop)
            asm volatile("s_nop 4\n"
                         "v_cmpx_le_u32 exec, 1, %4\n"
@@ -365,12 +366,12 @@ struct buffer_load_if<4, pre_nop>
                                   index_t v_offset,
                                   index_t /*s_offset*/,
                                   index_t i_offset /*max 0xFFF*/,
-                                   index_t flag           = 0,
+                                   index_t flag = 0,
                                   bool_constant<pre_nop> = {})
    {
        static_assert(sizeof(T) == 4);
        auto saved_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t    = typename impl::buffer_load_trait<4, T>::payload_t;
+        using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
        if constexpr(pre_nop)
            asm volatile("s_nop 4\n"
                         "v_cmpx_le_u32 exec, 1, %4\n"
@@ -398,12 +399,12 @@ struct buffer_load_if<2, pre_nop>
                                   index_t v_offset,
                                   index_t /*s_offset*/,
                                   index_t i_offset /*max 0xFFF*/,
-                                   index_t flag           = 0,
+                                   index_t flag = 0,
                                   bool_constant<pre_nop> = {})
    {
        static_assert(sizeof(T) == 4);
        auto saved_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t    = typename impl::buffer_load_trait<2, T>::payload_t;
+        using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
        if constexpr(pre_nop)
            asm volatile("s_nop 4\n"
                         "v_cmpx_le_u32 exec, 1, %4\n"
@@ -431,12 +432,12 @@ struct buffer_load_if<1, pre_nop>
                                   index_t v_offset,
                                   index_t /*s_offset*/,
                                   index_t i_offset /*max 0xFFF*/,
-                                   index_t flag           = 0,
+                                   index_t flag = 0,
                                   bool_constant<pre_nop> = {})
    {
        static_assert(sizeof(T) == 4);
        auto saved_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t    = typename impl::buffer_load_trait<1, T>::payload_t;
+        using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
        if constexpr(pre_nop)
            asm volatile("s_nop 4\n"
                         "v_cmpx_le_u32 exec, 1, %4\n"
@@ -620,7 +621,7 @@ struct buffer_store_if<16>
    {
        static_assert(sizeof(T) == 16);
        auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t   = fp32x4_t;
+        using mbuf_t = fp32x4_t;
        asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                     "buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
                     "s_mov_b64 exec %5"
@@ -677,7 +678,7 @@ struct buffer_store_if<4>
    {
        static_assert(sizeof(T) == 4);
        auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t   = float;
+        using mbuf_t = float;
        asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                     "buffer_store_dword %0, %1, %2, 0 offen offset:%3\n"
                     "s_mov_b64 exec %5"
@@ -705,7 +706,7 @@ struct buffer_store_if<2>
    {
        static_assert(sizeof(T) == 2);
        auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t   = short;
+        using mbuf_t = short;
        asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                     "buffer_store_short %0, %1, %2, 0 offen offset:%3\n"
                     "s_mov_b64 exec %5"
@@ -733,7 +734,7 @@ struct buffer_store_if<1>
    {
        static_assert(sizeof(T) == 4);
        auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t   = float;
+        using mbuf_t = float;
        asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                     "buffer_store_byte %0, %1, %2, 0 offen offset:%3\n"
                     "s_mov_b64 exec %5"
@@ -1269,53 +1270,33 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
 // Direct loads from global to LDS.
 CK_TILE_DEVICE_EXTERN void
 llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
-                                as3_uint32_ptr lds_ptr,
+                                __attribute__((address_space(3))) uint32_t* lds_ptr,
                                index_t size,
                                index_t voffset,
                                index_t soffset,
                                index_t offset,
                                index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");

-template <unsigned num_dwords, bool pre_nop = false>
-CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void* smem,
-                                                int32x4_t rsrc,
-                                                index_t voffset,
-                                                index_t /*soffset*/,
-                                                index_t ioffset /*max 0xFFF*/,
-                                                index_t /*flag*/       = 0,
-                                                bool_constant<pre_nop> = {})
+template <bool pre_nop = false>
+CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem,
+                                              int32x4_t rsrc,
+                                              index_t voffset,
+                                              index_t /*soffset*/,
+                                              index_t ioffset /*max 0xFFF*/,
+                                              index_t /*flag*/       = 0,
+                                              bool_constant<pre_nop> = {})
 {
-#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr)                            \
-    if constexpr(pre_nop)                                               \
-        asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \
-                     : "=r"(smem) /*dummy dependency for smem*/         \
-                     : "v"(voffset), "s"(rsrc), "n"(ioffset)            \
-                     : "memory");                                       \
-    else                                                                \
-        asm volatile(instr " %1, %2, 0 offen offset:%3 lds"             \
-                     : "=r"(smem) /*dummy dependency for smem*/         \
-                     : "v"(voffset), "s"(rsrc), "n"(ioffset)            \
+    if constexpr(pre_nop)
+        asm volatile("s_nop 4\n"
+                     "buffer_load_dword %1, %2, 0 offen offset:%3 lds"
+                     : "=r"(smem) /*dummy dependency for smem*/
+                     : "v"(voffset), "s"(rsrc), "n"(ioffset)
                     : "memory");
-
-    if constexpr(num_dwords == 1)
-    {
-        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dword");
-    }
-#if defined(__gfx950__)
-    else if constexpr(num_dwords == 3)
-    {
-        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx3");
-    }
-    else if constexpr(num_dwords == 4)
-    {
-        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx4");
-    }
-#endif
    else
-    {
-        static_assert(false, "wrong! not implemented data width");
-    }
-#undef CK_TILE_ASYNC_LOAD_WITH_INSTR
+        asm volatile("buffer_load_dword %1, %2, 0 offen offset:%3 lds"
+                     : "=r"(smem) /*dummy dependency for smem*/
+                     : "v"(voffset), "s"(rsrc), "n"(ioffset)
+                     : "memory");
 }

 CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
@@ -1334,17 +1315,6 @@ enum struct amd_buffer_coherence_enum
    glc               = 1,
    slc               = 2,
    glc_slc           = 3,
-    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
-    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
-    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
-    WAVE_NT0   = 0,
-    WAVE_NT1   = 2,
-    GROUP_NT0  = 1,
-    GROUP_NT1  = 3,
-    DEVICE_NT0 = 8,
-    DEVICE_NT1 = 10,
-    SYSTEM_NT0 = 9,
-    SYSTEM_NT1 = 11,
 };

 template <index_t N,
@@ -1779,25 +1749,22 @@ template <typename T,
          index_t N,
          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
          bool pre_nop                        = false>
-CK_TILE_DEVICE void amd_async_buffer_load_impl(CK_TILE_LDS_ADDR T* smem,
+CK_TILE_DEVICE void amd_async_buffer_load_impl(T* smem,
                                               int32x4_t src_wave_buffer_resource,
                                               index_t src_thread_addr_offset,
                                               index_t src_wave_addr_offset,
                                               index_t src_immediate_addr_offset = 0,
                                               bool_constant<pre_nop>            = {})
 {
-    constexpr index_t num_bytes = sizeof(T) * N;
-    constexpr index_t num_words = num_bytes / 4;
-    static_assert(num_bytes % 4 == 0 && (num_words == 1 || num_words == 3 || num_words == 4),
-                  "wrong! only support in dword, dwordx3, dwordx4");
+    static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");

-    async_buffer_load_dwordxn_v<num_words>(smem,
-                                           src_wave_buffer_resource,
-                                           src_thread_addr_offset,
-                                           src_wave_addr_offset,
-                                           src_immediate_addr_offset,
-                                           0,
-                                           bool_constant<pre_nop>{});
+    async_buffer_load_dword_v(smem,
+                              src_wave_buffer_resource,
+                              src_thread_addr_offset,
+                              src_wave_addr_offset,
+                              src_immediate_addr_offset,
+                              0,
+                              bool_constant<pre_nop>{});
 }

 template <typename T,
@@ -1812,38 +1779,29 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
                                          index_t flag                         = 0,
                                          bool_constant<oob_conditional_check> = {})
 {
-    constexpr index_t bytes = sizeof(T) * N;
+    static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");

-    // Used to catch the cases when src_immediate_addr_offset is NOT 0.
-    // Remove this assert once other sizes are implemented.
-    assert(src_immediate_addr_offset == 0 &&
-           "wrong! not implemented src_immediate_addr_offset size, only 0 supported");
-    ignore = src_immediate_addr_offset;
-
-#if defined(__gfx950__)
-    static_assert(bytes == 4 || bytes == 12 || bytes == 16,
-                  "wrong! only support in dword, dwordx3, dwordx4");
-    src_wave_addr_offset = 0;
-#else
-    static_assert(bytes == 4, "wrong! not implemented vector size");
-#endif
-
-    // Set up v_offset:
-    index_t v_offset = src_thread_addr_offset;
    if constexpr(oob_conditional_check)
-        v_offset = flag ? v_offset : src_wave_buffer_resource[2];
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wold-style-cast"
-    // Use C-style cast to change address space without dropping llvm noalias attribute
-    llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
-                                    (as3_uint32_ptr)(smem),
-                                    bytes,
-                                    v_offset,
-                                    src_wave_addr_offset,
-                                    /*src_immediate_addr_offset*/ 0,
-                                    static_cast<index_t>(coherence));
-#pragma clang diagnostic pop
+    {
+        index_t v_offset = flag ? src_thread_addr_offset : src_wave_buffer_resource[2];
+        llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
+                                        smem,
+                                        sizeof(uint32_t),
+                                        v_offset,
+                                        src_wave_addr_offset,
+                                        src_immediate_addr_offset,
+                                        static_cast<index_t>(coherence));
+    }
+    else
+    {
+        llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
+                                        smem,
+                                        sizeof(uint32_t),
+                                        src_thread_addr_offset,
+                                        src_wave_addr_offset,
+                                        src_immediate_addr_offset,
+                                        static_cast<index_t>(coherence));
+    }
 }

 template <index_t N,
@@ -2787,47 +2745,44 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer<T, N>& src_thread_
 #endif
 }

-#if defined(__gfx950__)
-template <typename T, index_t N, address_space_enum BufferAddressSpace>
-__device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
+template <typename T, index_t NumElemsPerThread>
+CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
+                                                  const index_t global_offset,
+                                                  T* lds_base_ptr,
+                                                  const index_t lds_offset,
+                                                  const bool is_valid,
+                                                  const index_t src_element_space_size)
 {
-#define __LDS_ADDR __attribute__((address_space(3)))
+    // Direct loads require that each thread reads and writes exactly a single DWORD.
+    constexpr auto dword_bytes      = 4;
+    constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
+    static_assert(bytes_per_thread == dword_bytes);

-    static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32),
-                  "We need to have the compatible compiler version to build this instruction");
+    const uint32_t* global_ptr =
+        reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
+    const int32x4_t src_resource =
+        make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T));
+    const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;

-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wold-style-cast"
-    // Use C-style cast to change address space without dropping llvm noalias attribute
-    const auto in_ptr_ = (__LDS_ADDR T*)(const_cast<T*>(in_ptr));
-#pragma clang diagnostic pop
-    if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::half_t>)
-    {
-        typedef __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16 llvm_fp16x4_t;
-        auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_fp16x4_t*>(in_ptr_);
-        return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4f16(lds_ptr));
-    }
-    else if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::bf16_t>)
-    {
-        typedef __attribute__((__vector_size__(4 * sizeof(__bf16)))) __bf16 llvm_bf16x4_t;
-        auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_bf16x4_t*>(in_ptr_);
-        return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4bf16(lds_ptr));
-    }
-    else if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::fp8_t> ||
-                      std::is_same_v<remove_cvref_t<T>, ck_tile::bf8_t> ||
-                      std::is_same_v<remove_cvref_t<T>, ck_tile::int8_t>)
-    {
-        typedef __attribute__((__vector_size__(2 * sizeof(index_t)))) index_t llvm_i32x2_t;
-        auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_i32x2_t*>(in_ptr_);
-        return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr8_b64_v2i32(lds_ptr));
-    }
-    else
-    {
-        static_assert(false, "not implemented");
-    }
-#undef __LDS_ADDR
-}
+#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
+    T* lds_ptr = lds_base_ptr + lds_offset;
+    auto const lds_ptr_sgpr =
+        __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
+    asm volatile("s_mov_b32 m0, %0; \n\t"
+                 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
+                 "v"(global_offset_bytes),
+                 "s"(src_resource)
+                 : "memory");
+#else
+    // LDS pointer must be attributed with the LDS address space.
+    __attribute__((address_space(3))) uint32_t* lds_ptr =
+        reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
+            reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
+
+    llvm_amdgcn_raw_buffer_load_lds(
+        src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
 #endif
+}

 } // namespace ck_tile

--- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
@@ -13,9 +13,6 @@
 #include "ck_tile/core/utility/type_traits.hpp"
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
-#include "ck_tile/core/utility/ignore.hpp"
-
-using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*;

 namespace ck_tile {

@@ -32,6 +29,10 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz
 {
    buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD};
    int32x4_t r = __builtin_bit_cast(int32x4_t, res);
+    r.x         = __builtin_amdgcn_readfirstlane(r.x);
+    r.y         = __builtin_amdgcn_readfirstlane(r.y);
+    r.z         = __builtin_amdgcn_readfirstlane(r.z);
+    r.w         = __builtin_amdgcn_readfirstlane(r.w);
    return r;
 }

@@ -1137,53 +1138,33 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
 // Direct loads from global to LDS.
 CK_TILE_DEVICE_EXTERN void
 llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
-                                as3_uint32_ptr lds_ptr,
+                                __attribute__((address_space(3))) uint32_t* lds_ptr,
                                index_t size,
                                index_t voffset,
                                index_t soffset,
                                index_t offset,
                                index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");

-template <unsigned num_dwords, bool pre_nop = false>
-CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void* smem,
-                                                int32x4_t rsrc,
-                                                index_t voffset,
-                                                index_t /*soffset*/,
-                                                index_t ioffset /*max 0xFFF*/,
-                                                index_t /*flag*/       = 0,
-                                                bool_constant<pre_nop> = {})
+template <bool pre_nop = false>
+CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem,
+                                              int32x4_t rsrc,
+                                              index_t voffset,
+                                              index_t /*soffset*/,
+                                              index_t ioffset /*max 0xFFF*/,
+                                              index_t /*flag*/       = 0,
+                                              bool_constant<pre_nop> = {})
 {
-#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr)                            \
-    if constexpr(pre_nop)                                               \
-        asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \
-                     : "=r"(smem) /*dummy dependency for smem*/         \
-                     : "v"(voffset), "s"(rsrc), "n"(ioffset)            \
-                     : "memory");                                       \
-    else                                                                \
-        asm volatile(instr " %1, %2, 0 offen offset:%3 lds"             \
-                     : "=r"(smem) /*dummy dependency for smem*/         \
-                     : "v"(voffset), "s"(rsrc), "n"(ioffset)            \
+    if constexpr(pre_nop)
+        asm volatile("s_nop 4\n"
+                     "buffer_load_dword %1, %2, 0 offen offset:%3 lds"
+                     : "=r"(smem) /*dummy dependency for smem*/
+                     : "v"(voffset), "s"(rsrc), "n"(ioffset)
                     : "memory");
-
-    if constexpr(num_dwords == 1)
-    {
-        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dword");
-    }
-#if defined(__gfx950__)
-    else if constexpr(num_dwords == 3)
-    {
-        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx3");
-    }
-    else if constexpr(num_dwords == 4)
-    {
-        CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx4");
-    }
-#endif
    else
-    {
-        static_assert(false, "wrong! not implemented data width");
-    }
-#undef CK_TILE_ASYNC_LOAD_WITH_INSTR
+        asm volatile("buffer_load_dword %1, %2, 0 offen offset:%3 lds"
+                     : "=r"(smem) /*dummy dependency for smem*/
+                     : "v"(voffset), "s"(rsrc), "n"(ioffset)
+                     : "memory");
 }

 CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
@@ -1202,17 +1183,6 @@ enum struct amd_buffer_coherence_enum
    glc               = 1,
    slc               = 2,
    glc_slc           = 3,
-    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
-    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
-    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
-    WAVE_NT0   = 0,
-    WAVE_NT1   = 2,
-    GROUP_NT0  = 1,
-    GROUP_NT1  = 3,
-    DEVICE_NT0 = 8,
-    DEVICE_NT1 = 10,
-    SYSTEM_NT0 = 9,
-    SYSTEM_NT1 = 11,
 };

 template <index_t N,
@@ -1556,18 +1526,15 @@ CK_TILE_DEVICE void amd_async_buffer_load_impl(T* smem,
                                               index_t src_immediate_addr_offset = 0,
                                               bool_constant<pre_nop>            = {})
 {
-    constexpr index_t num_bytes = sizeof(T) * N;
-    constexpr index_t num_words = num_bytes / 4;
-    static_assert(num_bytes % 4 == 0 && (num_words == 1 || num_words == 3 || num_words == 4),
-                  "wrong! only support in dword, dwordx3, dwordx4");
+    static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");

-    async_buffer_load_dwordxn_v<num_words>(smem,
-                                           src_wave_buffer_resource,
-                                           src_thread_addr_offset,
-                                           src_wave_addr_offset,
-                                           src_immediate_addr_offset,
-                                           0,
-                                           bool_constant<pre_nop>{});
+    async_buffer_load_dword_v(smem,
+                              src_wave_buffer_resource,
+                              src_thread_addr_offset,
+                              src_wave_addr_offset,
+                              src_immediate_addr_offset,
+                              0,
+                              bool_constant<pre_nop>{});
 }

 template <typename T,
@@ -1582,38 +1549,29 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
                                          index_t flag                         = 0,
                                          bool_constant<oob_conditional_check> = {})
 {
-    constexpr index_t bytes = sizeof(T) * N;
+    static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");

-    // Used to catch the cases when src_immediate_addr_offset is NOT 0.
-    // Remove this assert once other sizes are implemented.
-    assert(src_immediate_addr_offset == 0 &&
-           "wrong! not implemented src_immediate_addr_offset size, only 0 supported");
-    ignore = src_immediate_addr_offset;
-
-#if defined(__gfx950__)
-    static_assert(bytes == 4 || bytes == 12 || bytes == 16,
-                  "wrong! only support in dword, dwordx3, dwordx4");
-    src_wave_addr_offset = 0;
-#else
-    static_assert(bytes == 4, "wrong! not implemented vector size");
-#endif
-
-    // Set up v_offset:
-    index_t v_offset = src_thread_addr_offset;
    if constexpr(oob_conditional_check)
-        v_offset = flag ? v_offset : src_wave_buffer_resource[2];
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wold-style-cast"
-    // Use C-style cast to change address space without dropping llvm noalias attribute
-    llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
-                                    (as3_uint32_ptr)(smem),
-                                    bytes,
-                                    v_offset,
-                                    src_wave_addr_offset,
-                                    /*src_immediate_addr_offset*/ 0,
-                                    static_cast<index_t>(coherence));
-#pragma clang diagnostic pop
+    {
+        index_t v_offset = flag ? v_offset : src_wave_buffer_resource[2];
+        llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
+                                        smem,
+                                        sizeof(uint32_t),
+                                        v_offset,
+                                        src_wave_addr_offset,
+                                        src_immediate_addr_offset,
+                                        static_cast<index_t>(coherence));
+    }
+    else
+    {
+        llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
+                                        smem,
+                                        sizeof(uint32_t),
+                                        src_thread_addr_offset,
+                                        src_wave_addr_offset,
+                                        src_immediate_addr_offset,
+                                        static_cast<index_t>(coherence));
+    }
 }

 template <index_t N,
@@ -2565,6 +2523,11 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
                                                  const bool is_valid,
                                                  const index_t src_element_space_size)
 {
+    // Direct loads require that each thread reads and writes exactly a single DWORD.
+    constexpr auto dword_bytes      = 4;
+    constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
+    static_assert(bytes_per_thread == dword_bytes);
+
    const uint32_t* global_ptr =
        reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
    const int32x4_t src_resource =
@@ -2581,72 +2544,16 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
                 "s"(src_resource)
                 : "memory");
 #else
-    // Direct loads require that each thread reads and writes exactly a single DWORD.
-#if defined(__gfx9__)
-    constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
-#endif
-    // Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes).
-    // For gfx950: supports 1, 3, or 4 DWORDs per thread
-    // For gfx942: supports exactly 1 DWORD per thread
-#if defined(__gfx950__)
-    constexpr auto dword_bytes = 4;
-    static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
-                  bytes_per_thread == dword_bytes * 4);
-#elif defined(__gfx9__)
-    constexpr auto dword_bytes = 4;
-    static_assert(bytes_per_thread == dword_bytes);
-#endif
    // LDS pointer must be attributed with the LDS address space.
-    as3_uint32_ptr lds_ptr =
-        reinterpret_cast<as3_uint32_ptr>(reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
+    __attribute__((address_space(3))) uint32_t* lds_ptr =
+        reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
+            reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));

    llvm_amdgcn_raw_buffer_load_lds(
-        src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
+        src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
 #endif
 }

-#if defined(__gfx950__)
-template <typename T, index_t N, address_space_enum BufferAddressSpace>
-__device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
-{
-#define __LDS_ADDR __attribute__((address_space(3)))
-
-    static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32),
-                  "We need to have the compatible compiler version to build this instruction");
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wold-style-cast"
-    // Use C-style cast to change address space without dropping llvm noalias attribute
-    const auto in_ptr_ = (__LDS_ADDR T*)(const_cast<T*>(in_ptr));
-#pragma clang diagnostic pop
-    if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::half_t>)
-    {
-        typedef __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16 llvm_fp16x4_t;
-        auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_fp16x4_t*>(in_ptr_);
-        return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4f16(lds_ptr));
-    }
-    else if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::bf16_t>)
-    {
-        typedef __attribute__((__vector_size__(4 * sizeof(__bf16)))) __bf16 llvm_bf16x4_t;
-        auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_bf16x4_t*>(in_ptr_);
-        return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4bf16(lds_ptr));
-    }
-    else if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::fp8_t> ||
-                      std::is_same_v<remove_cvref_t<T>, ck_tile::bf8_t> ||
-                      std::is_same_v<remove_cvref_t<T>, ck_tile::int8_t>)
-    {
-        typedef __attribute__((__vector_size__(2 * sizeof(index_t)))) index_t llvm_i32x2_t;
-        auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_i32x2_t*>(in_ptr_);
-        return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr8_b64_v2i32(lds_ptr));
-    }
-    else
-    {
-        static_assert(false, "not implemented");
-    }
-#undef __LDS_ADDR
-}
-#endif
-
 } // namespace ck_tile

 #endif // CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN