diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 0ea091e607..1607a9e802 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -592,7 +592,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 1x1 filter, 14x14 image, C = 512 constexpr index_t N = 128; constexpr index_t C = 512; diff --git a/src/include/inline_asm.hpp b/src/include/amd_inline_asm.hip.hpp similarity index 87% rename from src/include/inline_asm.hpp rename to src/include/amd_inline_asm.hip.hpp index f7b1f217cb..38ac970981 100644 --- a/src/include/inline_asm.hpp +++ b/src/include/amd_inline_asm.hip.hpp @@ -1,8 +1,5 @@ #pragma once - -typedef float Float4 __attribute__((ext_vector_type(4))); - -extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; +#include "common.hip.hpp" #define NO_VM_WAIT 0 #define NO_LGKM_WAIT 0 @@ -10,7 +7,10 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; #define NO_DS_WRITE 0 #define NO_GLB_READ 0 -inline __device__ void vmcnt(index_t cnt) +// cast a pointer of LDS to its address +extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; + +__device__ void vmcnt(index_t cnt) { #if !NO_VM_WAIT if(cnt == 0) @@ -44,7 +44,7 @@ inline __device__ void vmcnt(index_t cnt) #endif } -inline __device__ void lgkmcnt(index_t cnt) +__device__ void lgkmcnt(index_t cnt) { #if !NO_LGKM_WAIT if(cnt == 0) @@ -84,7 +84,7 @@ inline __device__ void lgkmcnt(index_t cnt) #endif } -inline __device__ void outerProduct1x4(const float* a, const float* b, float* c) +__device__ void outerProduct1x4(const float* a, const float* b, float* c) { asm volatile("\n \ v_mac_f32 %0, %4, %5 \n \ @@ -104,7 +104,9 @@ inline __device__ void outerProduct1x4(const float* a, const float* b, float* c) "3"(c[3])); } -inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4& c) +__device__ void outerProduct1x4(const float& a, + const 
vector_type<float, 4>::MemoryType& b, + vector_type<float, 4>::MemoryType& c) { #if 0 asm volatile( @@ -123,8 +125,12 @@ inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4& #endif } -inline __device__ void -outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4& c2, Float4& c3) +__device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a, + const vector_type<float, 4>::MemoryType& b, + vector_type<float, 4>::MemoryType& c0, + vector_type<float, 4>::MemoryType& c1, + vector_type<float, 4>::MemoryType& c2, + vector_type<float, 4>::MemoryType& c3) { #if 0 asm volatile( @@ -179,7 +185,9 @@ outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4 #endif } -inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4* c) +__device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a, + const vector_type<float, 4>::MemoryType* b, + vector_type<float, 4>::MemoryType* c) { outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]); outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]); @@ -187,7 +195,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4* outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]); } -inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0) +__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0) { #if !NO_DS_READ if(offset == 0) @@ -413,7 +421,9 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0) #endif } -inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset = 0) +__device__ void global_load(vector_type<float, 4>::MemoryType& r, + const vector_type<float, 4>::MemoryType* ptr, + index_t offset = 0) { #if !NO_GLB_READ if(offset == 0) @@ -431,7 +441,8 @@ inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset #endif } -inline __device__ void ds_write_b128(const Float4& r, void* lds, index_t offset = 0) +__device__ void +ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0) { 
#if !NO_DS_WRITE if(offset == 0) diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 7f25a7322f..1ebc780bf8 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -130,6 +130,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 const FloatB* __restrict__ p_b_block, FloatC* __restrict__ p_c_thread) const { + static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value && + is_same<FloatC, float>::value, + "Run_asm only deal with float\n"); + constexpr auto True = integral_constant<bool, true>{}; constexpr auto False = integral_constant<bool, false>{}; @@ -158,6 +162,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor( Number{}, Number{}, Number{}); + static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 && + MPerThread == 8 && NPerThread == 8, + "Run_asm cannot deal with this GEMM shape yet\n"); + + using Float4 = vector_type<float, 4>::MemoryType; + float p_thread[a_thread_mtx.GetElementSpace() + b_thread_mtx.GetElementSpace()]; FloatA* p_a_thread = p_thread; diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index 1c41d04feb..feb9060be7 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -5,8 +5,8 @@ #include "Array.hip.hpp" #include "functional.hip.hpp" -#if DEVICE_BACKEDN_HIP -#include "inline_asm.hpp" +#if DEVICE_BACKEND_HIP +#include "amd_inline_asm.hip.hpp" #endif __device__ index_t get_thread_local_1d_id() { return threadIdx.x; } @@ -67,8 +67,3 @@ __host__ __device__ constexpr T min(T x, Ts... xs) return x < y ? 
x : y; } } // namespace mod_conv - -#if DEVICE_BACKEND_HIP -// cast a pointer of LDS to its address -extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; -#endif diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index ee44eebbe7..31791542ae 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -16,12 +16,14 @@ template <> struct vector_type<float, 2> { #if DEVICE_BACKEND_HIP - // For some reason, HIP compiler need this definition to generate optimal load and store instruction + // For some reason, HIP compiler need this definition to generate optimal load and store + // instruction typedef float MemoryType __attribute__((ext_vector_type(2))); #elif DEVICE_BACKEND_CUDA // For some reason, CUDA need this definition to, otherwise // compiler won't generate optimal load and store instruction, and - // kernel would produce wrong result, indicating the compiler fail to generate correct instruction, + // kernel would produce wrong result, indicating the compiler fail to generate correct + // instruction, using MemoryType = float2; #endif @@ -43,12 +45,14 @@ template <> struct vector_type<float, 4> { #if DEVICE_BACKEND_HIP - // For some reason, HIP compiler need this definition to generate optimal load and store instruction + // For some reason, HIP compiler need this definition to generate optimal load and store + // instruction typedef float MemoryType __attribute__((ext_vector_type(4))); #elif DEVICE_BACKEND_CUDA // For some reason, CUDA need this definition to, otherwise // compiler won't generate optimal load and store instruction, and - // kernel would produce wrong result, indicating the compiler fail to generate correct instruction, + // kernel would produce wrong result, indicating the compiler fail to generate correct + // instruction, using MemoryType = float4; #endif }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp 
b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index acd4df9662..64d3c03970 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -239,9 +239,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #elif 0 blockwise_gemm.Run_asm #endif - (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block + y * Wi + x, - p_out_thread); + (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block + y * Wi + x, + p_out_thread); } } } diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index f1cd81b32b..7c867b620f 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -34,11 +34,6 @@ template struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { - __host__ - __device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer() - { - } - __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const @@ -203,7 +198,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer // preload data into LDS { -#if 1 Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; @@ -212,19 +206,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, p_wei_register_clipboard); - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double); +#if 0 + 
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_double); -#elif 0 - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); - + p_wei_block_double); +#elif 1 vmcnt(0); blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, p_in_block_double); @@ -278,28 +264,26 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 1 +#if 0 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 blockwise_gemm.Run_asm #endif - (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_now + y * Wi + x, - p_out_thread); + (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_now + y * Wi + x, + p_out_thread); } } -#if 1 +#if 0 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next); -#elif 0 - // if work with RunLoadRegisterClipboard_asm, need to wait +#elif 1 vmcnt(0); - blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, p_in_block_next); blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, @@ -329,32 +313,29 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 1 +#if 0 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 blockwise_gemm.Run_asm #endif - (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_double + y * Wi + x, - p_out_thread); + 
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_double + y * Wi + x, + p_out_thread); } } -#if 1 +#if 0 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double + in_block_space); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_double + wei_block_space); #else - // if work with RunLoadRegisterClipboard_asm, need to wait vmcnt(0); - blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, p_wei_block_double + wei_block_space); #endif @@ -366,17 +347,17 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 1 +#if 0 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 blockwise_gemm.Run_asm #endif - (p_wei_block_double + wei_block_space + - wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_double + in_block_space + y * Wi + x, - p_out_thread); + (p_wei_block_double + wei_block_space + + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_double + in_block_space + y * Wi + x, + p_out_thread); } } }