From 3c0646d490b59e4f19763de5e59e2974a2801f72 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 17 Jun 2019 17:28:24 -0500 Subject: [PATCH] bring back some inline asm --- .../include/utility/amd_inline_asm.hpp | 73 +++++++++++++++++++ .../include/utility/config_amd.hpp.in | 15 ---- 2 files changed, 73 insertions(+), 15 deletions(-) diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index 6400fb708f..c4643543fc 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -5,6 +5,79 @@ namespace ck { +// cast a pointer of LDS to its address +extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; + +__device__ void vmcnt(index_t cnt) +{ + if(cnt == 0) + { + asm volatile("\n \ + s_waitcnt vmcnt(0) \n \ + " ::); + } + else if(cnt == 1) + { + asm volatile("\n \ + s_waitcnt vmcnt(1) \n \ + " ::); + } + else if(cnt == 2) + { + asm volatile("\n \ + s_waitcnt vmcnt(2) \n \ + " ::); + } + else if(cnt == 4) + { + asm volatile("\n \ + s_waitcnt vmcnt(2) \n \ + " ::); + } + else + { + assert(false); + } +} + +__device__ void lgkmcnt(index_t cnt) +{ + if(cnt == 0) + { + asm volatile("\n \ + s_waitcnt lgkmcnt(0) \n \ + " ::); + } + else if(cnt == 1) + { + asm volatile("\n \ + s_waitcnt lgkmcnt(1) \n \ + " ::); + } + else if(cnt == 2) + { + asm volatile("\n \ + s_waitcnt lgkmcnt(2) \n \ + " ::); + } + else if(cnt == 3) + { + asm volatile("\n \ + s_waitcnt lgkmcnt(3) \n \ + " ::); + } + else if(cnt == 4) + { + asm volatile("\n \ + s_waitcnt lgkmcnt(4) \n \ + " ::); + } + else + { + assert(false); + } +} + __device__ void outerProduct1x4(const float* a, const float* b, float* c) { asm volatile("\n \ diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index babdbc47cf..404a9853c3 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -24,21 +24,6 @@ __device__ void fused_multiply_accumulate(float& d, const float& s0, const float d += s0 * s1; } -#if 0 -__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; } - -__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1) -{ - d += s0.x * s1.x; - d += s0.y * s1.y; -} - -__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1) -{ - d += s0.x * s1.x + s0.y * s1.y; -} -#endif - } // namespace ck #endif