From dabfa77fc68bdbcfc2d10cdd515605c053cf1ed0 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 5 Apr 2019 02:13:29 -0500 Subject: [PATCH] clipboard float4 copy and paste C++ code --- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 7 +- src/include/blockwise_2d_tensor_op.hip.hpp | 131 ++++++++++++- src/include/data_type.hip.hpp | 12 +- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 180 +++++++----------- src/include/inline_asm.hpp | 46 +++-- src/include/threadwise_gemm.hip.hpp | 25 ++- 6 files changed, 246 insertions(+), 155 deletions(-) diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index 1492d22a16..0612052e00 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -238,9 +238,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, constexpr index_t GemmNLevel1Cluster = 4; constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmThreadPerColumnPerCluster = 8; - constexpr index_t GemmThreadPerRowPerCluster = 8; - constexpr index_t InBlockCopyThreadPerDim0 = 4; constexpr index_t InBlockCopyThreadPerDim1 = 16; @@ -272,7 +269,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, for(index_t i = 0; i < nrepeat; ++i) { constexpr auto gridwise_conv = -#if 1 +#if 0 GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #else GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer @@ -288,8 +285,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, CPerBlock, BPerThread, KPerThread, - GemmThreadPerColumnPerCluster, - GemmThreadPerRowPerCluster, GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 5a29f94712..2bbf0edd30 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ 
-1,5 +1,6 @@ #pragma once #include "ConstantTensorDescriptor.hip.hpp" +#include "inline_asm.hpp" template __device__ void @@ -517,9 +518,9 @@ struct Blockwise2dTensorCopy3 constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0; auto f_copy = [&](index_t iloop) { - *(reinterpret_cast(p_clipboard + iloop * 4)) = - *(reinterpret_cast(p_src + mSrcMyThreadOffset + - iloop * src_loop_stride)); + *(reinterpret_cast(&p_clipboard[iloop * DataPerRead])) = + *(reinterpret_cast( + &p_src[mSrcMyThreadOffset + iloop * src_loop_stride])); }; for(index_t iloop = 0; iloop < nloop_d0; ++iloop) @@ -568,8 +569,8 @@ struct Blockwise2dTensorCopy3 constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0; auto f_copy = [&](index_t iloop) { - *(reinterpret_cast(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) = - *(reinterpret_cast(p_clipboard + iloop * 4)); + *(reinterpret_cast(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride])) = + *(reinterpret_cast(&p_clipboard[iloop * DataPerRead])); }; for(index_t iloop = 0; iloop < nloop_d0; ++iloop) @@ -589,4 +590,124 @@ struct Blockwise2dTensorCopy3 } } } + +#if DEVICE_BACKEND_HIP + __device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src, + Float* p_clipboard) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + constexpr index_t L0 = CopyLengths{}.Get(I0); + constexpr index_t L1 = CopyLengths{}.Get(I1); + + constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead; + constexpr index_t thread_per_d0 = BlockSize / thread_per_d1; + + constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1; + + if(BlockSize > num_active_thread) + { + if(get_thread_local_1d_id() >= num_active_thread) + { + return; + } + } + + constexpr index_t nloop_d0 = L0 / thread_per_d0; + + constexpr index_t src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0; + constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0; + + auto 
f_copy = [&](index_t iloop) {
+#if 0
+            *(reinterpret_cast<Float4*>(&p_clipboard[iloop * DataPerRead])) =
+                *(reinterpret_cast<const Float4*>(&p_src[mSrcMyThreadOffset +
+                                                         iloop * src_loop_stride]));
+#else
+            static_assert(is_same<Float, float>::value && DataPerRead == 4,
+                          "global_load is only for float4");
+
+            global_load(*(reinterpret_cast<Float4*>(&p_clipboard[iloop * DataPerRead])),
+                        reinterpret_cast<const Float4*>(
+                            &p_src[mSrcMyThreadOffset + iloop * src_loop_stride]));
+#endif
+        };
+
+        for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
+        {
+            f_copy(iloop);
+        }
+
+        constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0);
+
+        if(has_tail_d0)
+        {
+            constexpr index_t tail_d0 = L0 - nloop_d0 * thread_per_d0;
+
+            if(get_thread_local_1d_id() < tail_d0 * thread_per_d1)
+            {
+                f_copy(nloop_d0);
+            }
+        }
+    }
+
+    __device__ void RunStoreRegisterClipboard_asm(const Float* __restrict__ p_clipboard,
+                                                  Float* __restrict__ p_dst) const
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+
+        constexpr index_t L0 = CopyLengths{}.Get(I0);
+        constexpr index_t L1 = CopyLengths{}.Get(I1);
+
+        constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
+        constexpr index_t thread_per_d0 = BlockSize / thread_per_d1;
+
+        constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1;
+
+        if(BlockSize > num_active_thread)
+        {
+            if(get_thread_local_1d_id() >= num_active_thread)
+            {
+                return;
+            }
+        }
+
+        constexpr index_t nloop_d0 = L0 / thread_per_d0;
+
+        constexpr index_t src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
+        constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
+
+        auto f_copy = [&](index_t iloop) {
+#if 0
+            *(reinterpret_cast<Float4*>(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride])) =
+                *(reinterpret_cast<const Float4*>(&p_clipboard[iloop * DataPerRead]));
+#else
+            static_assert(is_same<Float, float>::value && DataPerRead == 4,
+                          "ds_write_b128 is only for float4");
+
+            ds_write_b128(*(reinterpret_cast<const Float4*>(&p_clipboard[iloop * DataPerRead])),
+                          &p_dst[mDstMyThreadOffset + iloop * dst_loop_stride]);
+#endif + }; + + for(index_t iloop = 0; iloop < nloop_d0; ++iloop) + { + f_copy(iloop); + } + + constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0); + + if(has_tail_d0) + { + constexpr index_t tail_d0 = L0 - nloop_d0 * thread_per_d0; + + if(get_thread_local_1d_id() < tail_d0 * thread_per_d1) + { + f_copy(nloop_d0); + } + } + } +#endif }; diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index 1261e19989..20d9d91fa5 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -9,13 +9,13 @@ struct vector_type template <> struct vector_type { - using MemoryType = float; + typedef float MemoryType; }; template <> struct vector_type { - using MemoryType = float2; + typedef float MemoryType __attribute__((ext_vector_type(2))); __host__ __device__ static MemoryType Pack(float s0, float s1) { @@ -34,13 +34,7 @@ struct vector_type template <> struct vector_type { - using MemoryType = float4; -}; - -template <> -struct vector_type -{ - using MemoryType = float4; + typedef float MemoryType __attribute__((ext_vector_type(4))); }; #if 0 diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index b9de731b23..be4185a6ab 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -19,8 +19,6 @@ template {}); -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc"); - print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc"); - print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc"); - - print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc"); - print_ConstantTensorDescriptor(wei_ek_global_desc, 
"wei_ek_global_desc"); - - print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); - print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc"); - print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc"); - print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); - - printf("KPerBlock %u\n", KPerBlock); - } -#endif - // blockwise in copy // formmat is [CPerBlock,BPerBlock + BGhostRead] #if 0 - const auto blockwise_in_copy = - Blockwise2dTensorCopy1{}; + const auto blockwise_in_copy = + Blockwise2dTensorCopy1{}; #elif 0 const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; + const auto blockwise_wei_copy = + Blockwise2dTensorCopy1{}; #elif 0 const auto blockwise_wei_copy = Blockwise2dTensorCopy2{}, Number{}); -#if 0 - const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC{}; -#else const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2{}; -#endif // LDS: be careful of alignment constexpr index_t max_align = @@ -235,27 +200,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer const Float* p_wei_global_block_offset = p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); -// preload data into LDS -#if 0 - blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_0); - blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_0); -#else - Float4 tmp_in, tmp_wei; - Float4* glb_in_p = - (Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset); - Float4* glb_wei_p = - (Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset); - - global_load(tmp_in, glb_in_p); - global_load(tmp_wei, glb_wei_p); - - Float4* loc_in_p = (Float4*)(p_in_block_double + blockwise_in_copy.mDstMyThreadOffset); - Float4* loc_wei_p = (Float4*)(p_wei_block_double + blockwise_wei_copy.mDstMyThreadOffset); - - vmcnt(0); - ds_write_b128(tmp_in, loc_in_p); - ds_write_b128(tmp_wei, loc_wei_p); -#endif + // preload data 
into LDS + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_double); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_double); // register Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; @@ -285,18 +232,18 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0); // load next data -#if 0 - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; +#if 1 + Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; + Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - __syncthreads(); + __syncthreads(); - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); + blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, + p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); -#elif 1 + blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, + p_wei_register_clipboard); +#elif 0 Float4 tmp_in, tmp_wei; Float4* glb_in_p = (Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset); @@ -316,7 +263,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer for(index_t x = 0; x < X; ++x) { #if 0 - blockwise_gemm.Run + blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 @@ -328,11 +275,20 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer } } -#if 0 - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_next); -#elif 1 +#if 1 + blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, + p_in_block_next); + 
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, + p_wei_block_next); +#elif 0 + // if work with RunLoadRegisterClipboard_asm, need to wait + vmcnt(0); + + blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, + p_in_block_next); + blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, + p_wei_block_next); +#elif 0 Float4* loc_in_p = (Float4*)(p_in_block_next + blockwise_in_copy.mDstMyThreadOffset); Float4* loc_wei_p = @@ -352,16 +308,16 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0); - Float4 tmp_in, tmp_wei; - Float4* glb_in_p = - (Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset); - Float4* glb_wei_p = - (Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset); - __syncthreads(); - global_load(tmp_in, glb_in_p); - global_load(tmp_wei, glb_wei_p); + Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; + Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + + blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, + p_in_register_clipboard); + + blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, + p_wei_register_clipboard); for(index_t y = 0; y < Y; ++y) { @@ -369,10 +325,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { #if 0 blockwise_gemm.Run -#elif 1 - blockwise_gemm.Run_asm #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer +#elif 1 + blockwise_gemm.Run_asm #endif (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), p_in_block_double + y * Wi + x, @@ -380,14 +336,22 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer } } - Float4* loc_in_p = (Float4*)(p_in_block_double + in_block_element_space + - 
blockwise_in_copy.mDstMyThreadOffset); - Float4* loc_wei_p = (Float4*)(p_wei_block_double + wei_block_element_space + - blockwise_wei_copy.mDstMyThreadOffset); +#if 1 + blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, + p_in_block_double + in_block_element_space); + blockwise_wei_copy.RunStoreRegisterClipboard( + p_wei_register_clipboard, p_wei_block_double + wei_block_element_space); +#else + // if work with RunLoadRegisterClipboard_asm, need to wait vmcnt(0); - ds_write_b128(tmp_in, loc_in_p); - ds_write_b128(tmp_wei, loc_wei_p); + + blockwise_in_copy.RunStoreRegisterClipboard_asm( + p_in_register_clipboard, p_in_block_double + in_block_element_space); + + blockwise_wei_copy.RunStoreRegisterClipboard_asm( + p_wei_register_clipboard, p_wei_block_double + wei_block_element_space); +#endif // odd __syncthreads(); @@ -398,10 +362,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { #if 0 blockwise_gemm.Run -#elif 1 - blockwise_gemm.Run_asm #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer +#elif 1 + blockwise_gemm.Run_asm #endif (p_wei_block_double + in_block_element_space + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), @@ -423,20 +387,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer const index_t k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row; const index_t b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col; -#if 0 - if(get_block_1d_id() == 0) - { - printf("%u %u, row %u col %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n", - get_block_1d_id(), - get_thread_local_1d_id(), - matrix_c_index.row, - matrix_c_index.col, - k_data_begin, - b_data_begin, - p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]); - } -#endif - for(index_t k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) { for(index_t b = 0; b < out_kb_thread_desc.GetLength(I1); ++b) diff --git a/src/include/inline_asm.hpp b/src/include/inline_asm.hpp index b53d6ed316..f7b1f217cb 
100644 --- a/src/include/inline_asm.hpp +++ b/src/include/inline_asm.hpp @@ -10,7 +10,7 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; #define NO_DS_WRITE 0 #define NO_GLB_READ 0 -inline __device__ void vmcnt(int cnt) +inline __device__ void vmcnt(index_t cnt) { #if !NO_VM_WAIT if(cnt == 0) @@ -39,12 +39,12 @@ inline __device__ void vmcnt(int cnt) } else { - assert(0); + assert(false); } #endif } -inline __device__ void lgkmcnt(int cnt) +inline __device__ void lgkmcnt(index_t cnt) { #if !NO_LGKM_WAIT if(cnt == 0) @@ -79,7 +79,7 @@ inline __device__ void lgkmcnt(int cnt) } else { - assert(0); + assert(false); } #endif } @@ -187,7 +187,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4* outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]); } -inline __device__ void ds_read_b128(Float4& r, void* lds, int offset = 0) +inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0) { #if !NO_DS_READ if(offset == 0) @@ -408,29 +408,43 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, int offset = 0) } else { - assert(0); + assert(false); } #endif } -inline __device__ void global_load(Float4& r, Float4* ptr) +inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset = 0) { #if !NO_GLB_READ - asm volatile("\n \ - global_load_dwordx4 %0, %1, off \n \ - " - : "=v"(r) - : "v"(ptr)); + if(offset == 0) + { + asm volatile("\n \ + global_load_dwordx4 %0, %1, off \n \ + " + : "=v"(r) + : "v"(ptr)); + } + else + { + assert(false); + } #endif } -inline __device__ void ds_write_b128(Float4& r, void* lds, int offset = 0) +inline __device__ void ds_write_b128(const Float4& r, void* lds, index_t offset = 0) { #if !NO_DS_WRITE - asm volatile("\n \ + if(offset == 0) + { + asm volatile("\n \ ds_write_b128 %0, %1 \n \ " - : - : "v"(__to_local(lds)), "v"(r)); + : + : "v"(__to_local(lds)), "v"(r)); + } + else + { + assert(false); + } #endif } diff --git 
a/src/include/threadwise_gemm.hip.hpp b/src/include/threadwise_gemm.hip.hpp
index 79fc1bf699..0797fe0b8c 100644
--- a/src/include/threadwise_gemm.hip.hpp
+++ b/src/include/threadwise_gemm.hip.hpp
@@ -14,12 +14,29 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
     for(index_t i = 0; i < NRow; ++i)
     {
-        for(index_t j = 0; j < NCol; ++j)
+        // optimize for vector-4 load
+        if(NCol % 4 == 0)
         {
-            const index_t src_index = src_mtx.Get1dIndex(i, j);
-            const index_t dst_index = dst_mtx.Get1dIndex(i, j);
+            using vector_t = typename vector_type<Float, 4>::MemoryType;
 
-            p_dst[dst_index] = p_src[src_index];
+            for(index_t j = 0; j < NCol / 4; ++j)
+            {
+                const index_t src_index = src_mtx.Get1dIndex(i, 4 * j);
+                const index_t dst_index = dst_mtx.Get1dIndex(i, 4 * j);
+
+                *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+                    *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+            }
+        }
+        else
+        {
+            for(index_t j = 0; j < NCol; ++j)
+            {
+                const index_t src_index = src_mtx.Get1dIndex(i, j);
+                const index_t dst_index = dst_mtx.Get1dIndex(i, j);
+
+                p_dst[dst_index] = p_src[src_index];
+            }
         }
     }
 }