Experimenting with global and buffer load/store

2026-05-24 14:54:47 +00:00 · 2019-09-18 01:37:28 -05:00
parent 9f46cdf5fa
commit c7a6545ec4
5 changed files with 223 additions and 80 deletions
--- a/composable_kernel/include/utility/amd_inline_asm.hpp
+++ b/composable_kernel/include/utility/amd_inline_asm.hpp
@@ -8,21 +8,169 @@ namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);

-// buffer_load and buffer_store
+// __global_load and __global_store
+//
+// __global_load: primary template, declared here only. Explicit specializations for
+// T = float with VectorSize = 1, 2 and 4 are defined further down in this file.
+//
+//   p_src_block       - base pointer of the source block
+//   src_thread_offset - offset that the float specializations pass in a vector register
+//   src_const_offset  - offset that the float specializations fold into the base pointer
+//                       (active code path) before passing it in scalar registers
+//
+// Returns vector_type<T, VectorSize>::MemoryType holding the loaded data.
 template <typename T, index_t VectorSize>
 __device__ typename vector_type<T, VectorSize>::MemoryType
-buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+__global_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);

+// __global_store: primary template, declared here only. An explicit specialization for
+// T = float with VectorSize = 1 is defined further down in this file.
+//
+//   src               - value to be written
+//   p_dst_block       - base pointer of the destination block
+//   dst_thread_offset - offset that the float specialization passes in a vector register
+//   dst_const_offset  - offset that the float specialization folds into the base pointer
+//                       (active code path) before passing it in scalar registers
 template <typename T, index_t VectorSize>
-__device__ void buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
-                             T* p_dst_block,
-                             uint32_t dst_thread_offset,
-                             uint32_t dst_const_offset);
+__device__ void __global_store(const typename vector_type<T, VectorSize>::MemoryType& src,
+                               T* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset);

 template <>
-__device__ float buffer_load<float, 1>(const float* p_src_block,
-                                       uint32_t src_thread_offset,
-                                       uint32_t src_const_offset)
+__device__ float __global_load<float, 1>(const float* p_src_block,
+                                         uint32_t src_thread_offset,
+                                         uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    float dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+     global_load_dword %0, %1, %2, offset:0 \n \
+     s_waitcnt 0 \n \
+     "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+
+    return dst;
+#else // compute on SALU
+    float dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    asm volatile("\n \
+     global_load_dword %0, %1, %2, offset:0 \n \
+     ;;s_waitcnt 0 \n \
+     "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+
+    return dst;
+#endif
+}
+
+// __global_load<float, 2>: load two floats with the `global_load_dwordx2` instruction.
+//
+//   p_src_block       - base pointer of the source block
+//   src_thread_offset - per-thread offset, widened to 64 bits and passed in vector
+//                       registers ("v" constraint)
+//   src_const_offset  - on the active path, added to p_src_block with float* arithmetic;
+//                       the resulting pointer is passed in scalar registers ("s" constraint)
+//
+// Returns the loaded data as vector_type<float, 2>::MemoryType. The function does not
+// return before the load has completed.
+//
+// NOTE(review): on the active path src_const_offset is scaled by sizeof(float) through
+// pointer arithmetic while src_thread_offset reaches the instruction unscaled, and the
+// disabled path adds both offsets unscaled. Confirm which unit callers pass.
+template <>
+__device__ vector_type<float, 2>::MemoryType __global_load<float, 2>(const float* p_src_block,
+                                                                     uint32_t src_thread_offset,
+                                                                     uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    vector_type<float, 2>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+     global_load_dwordx2 %0, %1, %2, offset:0 \n \
+     s_waitcnt 0 \n \
+     "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block)
+                 : "memory");
+
+    return dst;
+#else // compute on SALU
+    vector_type<float, 2>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    // The wait has to be part of the asm statement: the compiler has no visibility into the
+    // load issued inside the asm string, so it will not wait for it before dst is read.
+    // The "memory" clobber tells the compiler that this statement accesses memory that is
+    // not listed in its operands, so surrounding memory accesses are not moved across it.
+    asm volatile("\n \
+     global_load_dwordx2 %0, %1, %2, offset:0 \n \
+     s_waitcnt vmcnt(0) \n \
+     "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset)
+                 : "memory");
+
+    return dst;
+#endif
+}
+
+// __global_load<float, 4>: load four floats with the `global_load_dwordx4` instruction.
+//
+//   p_src_block       - base pointer of the source block
+//   src_thread_offset - per-thread offset, widened to 64 bits and passed in vector
+//                       registers ("v" constraint)
+//   src_const_offset  - on the active path, added to p_src_block with float* arithmetic;
+//                       the resulting pointer is passed in scalar registers ("s" constraint)
+//
+// Returns the loaded data as vector_type<float, 4>::MemoryType. The function does not
+// return before the load has completed.
+//
+// NOTE(review): on the active path src_const_offset is scaled by sizeof(float) through
+// pointer arithmetic while src_thread_offset reaches the instruction unscaled, and the
+// disabled path adds both offsets unscaled. Confirm which unit callers pass.
+template <>
+__device__ vector_type<float, 4>::MemoryType __global_load<float, 4>(const float* p_src_block,
+                                                                     uint32_t src_thread_offset,
+                                                                     uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    vector_type<float, 4>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+     global_load_dwordx4 %0, %1, %2, offset:0 \n \
+     s_waitcnt 0 \n \
+     "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block)
+                 : "memory");
+
+    return dst;
+#else // compute on SALU
+    vector_type<float, 4>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    // The wait has to be part of the asm statement: the compiler has no visibility into the
+    // load issued inside the asm string, so it will not wait for it before dst is read.
+    // The "memory" clobber tells the compiler that this statement accesses memory that is
+    // not listed in its operands, so surrounding memory accesses are not moved across it.
+    asm volatile("\n \
+     global_load_dwordx4 %0, %1, %2, offset:0 \n \
+     s_waitcnt vmcnt(0) \n \
+     "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset)
+                 : "memory");
+
+    return dst;
+#endif
+}
+
+// __global_store<float, 1>: store one float with the `global_store_dword` instruction.
+//
+//   src               - value to be written, passed in a vector register ("v" constraint)
+//   p_dst_block       - base pointer of the destination block
+//   dst_thread_offset - per-thread offset, widened to 64 bits and passed in vector
+//                       registers ("v" constraint)
+//   dst_const_offset  - on the active path, added to p_dst_block with float* arithmetic;
+//                       the resulting pointer is passed in scalar registers ("s" constraint)
+//
+// On the active path the `s_waitcnt` line is commented out inside the asm string, so the
+// function does not wait for the store to complete.
+//
+// NOTE(review): on the active path dst_const_offset is scaled by sizeof(float) through
+// pointer arithmetic while dst_thread_offset reaches the instruction unscaled, and the
+// disabled path adds both offsets unscaled. Confirm which unit callers pass.
+template <>
+__device__ void __global_store<float, 1>(const float& src,
+                                         float* p_dst_block,
+                                         uint32_t dst_thread_offset,
+                                         uint32_t dst_const_offset)
+{
+#if 0 // compute on VALU
+    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset + dst_const_offset);
+
+    asm volatile("\n \
+     global_store_dword %0, %1, %2, offset:0 \n \
+     s_waitcnt 0 \n \
+     "
+                 :
+                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block)
+                 : "memory");
+#else // compute on SALU
+    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset);
+
+    float* p_dst_block_with_offset = p_dst_block + dst_const_offset;
+
+    // The "memory" clobber tells the compiler that this statement writes memory that is not
+    // listed in its operands, so surrounding memory accesses are not moved across it and
+    // values cached in registers are not assumed to still match memory afterwards.
+    asm volatile("\n \
+     global_store_dword %0, %1, %2, offset:0 \n \
+     ;;s_waitcnt 0 \n \
+     "
+                 :
+                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block_with_offset)
+                 : "memory");
+#endif
+}
+
+// __buffer_load and __buffer_store
+//
+// __buffer_load: primary template, declared here only. Explicit specializations for
+// T = float with VectorSize = 1, 2 and 4 are defined further down in this file.
+//
+//   p_src_block       - base pointer of the source block
+//   src_thread_offset - per-thread offset (presumably the part that differs between
+//                       threads; confirm against the specializations)
+//   src_const_offset  - additional offset (presumably common to all threads; confirm
+//                       against the specializations)
+//
+// Returns vector_type<T, VectorSize>::MemoryType holding the loaded data.
+template <typename T, index_t VectorSize>
+__device__ typename vector_type<T, VectorSize>::MemoryType
+__buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+
+// __buffer_store: primary template, declared here only. An explicit specialization for
+// T = float with VectorSize = 1 is defined further down in this file.
+//
+//   src               - value to be written
+//   p_dst_block       - base pointer of the destination block
+//   dst_thread_offset - per-thread offset (presumably the part that differs between
+//                       threads; confirm against the specialization)
+//   dst_const_offset  - additional offset (presumably common to all threads; confirm
+//                       against the specialization)
+template <typename T, index_t VectorSize>
+__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
+                               T* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset);
+
+template <>
+__device__ float __buffer_load<float, 1>(const float* p_src_block,
+                                         uint32_t src_thread_offset,
+                                         uint32_t src_const_offset)
 {
    float dst;

@@ -35,7 +183,7 @@ __device__ float buffer_load<float, 1>(const float* p_src_block,
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
-    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
    s_waitcnt 0 \n \
    "
                 : "=v"(dst)
@@ -45,9 +193,9 @@ __device__ float buffer_load<float, 1>(const float* p_src_block,
 }

 template <>
-__device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float* p_src_block,
-                                                                   uint32_t src_thread_offset,
-                                                                   uint32_t src_const_offset)
+__device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float* p_src_block,
+                                                                     uint32_t src_thread_offset,
+                                                                     uint32_t src_const_offset)
 {
    vector_type<float, 2>::MemoryType dst;

@@ -60,7 +208,7 @@ __device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float*
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
-    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
    s_waitcnt 0 \n \
    "
                 : "=v"(dst)
@@ -70,9 +218,9 @@ __device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float*
 }

 template <>
-__device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float* p_src_block,
-                                                                   uint32_t src_thread_offset,
-                                                                   uint32_t src_const_offset)
+__device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float* p_src_block,
+                                                                     uint32_t src_thread_offset,
+                                                                     uint32_t src_const_offset)
 {
    vector_type<float, 4>::MemoryType dst;

@@ -85,7 +233,7 @@ __device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float*
    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;

    asm volatile("\n \
-    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
    s_waitcnt 0 \n \
    "
                 : "=v"(dst)
@@ -95,10 +243,10 @@ __device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float*
 }

 template <>
-__device__ void buffer_store<float, 1>(const float& src,
-                                       float* p_dst_block,
-                                       uint32_t dst_thread_offset,
-                                       uint32_t dst_const_offset)
+__device__ void __buffer_store<float, 1>(const float& src,
+                                         float* p_dst_block,
+                                         uint32_t dst_thread_offset,
+                                         uint32_t dst_const_offset)
 {
    int32x4_t dst_block_setting{0};
    // fill in byte 0 - 1
@@ -109,7 +257,7 @@ __device__ void buffer_store<float, 1>(const float& src,
    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;

    asm volatile("\n \
-    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
+    __buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
    s_waitcnt 0 \n \
    "
                 :