From 75ca00f748390dec113981fd8ad71418a85e6dd6 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 9 Apr 2019 18:07:36 -0500 Subject: [PATCH] tidy yp [ROCm/composable_kernel commit: 471830a052b2ed6135ad4c41244f0ec9057c0f09] --- ...icit_gemm_convolution_1_chwn_cyxk_khwn.hpp | 62 +++++++++---------- src/include/ConstantTensorDescriptor.hip.hpp | 6 +- src/include/blockwise_4d_tensor_op.hip.hpp | 11 ++-- ...on_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp | 26 +++----- ...1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 43 +++++-------- ..._implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp | 52 +++++++--------- src/include/threadwise_gemm.hip.hpp | 9 ++- 7 files changed, 98 insertions(+), 111 deletions(-) diff --git a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp index 6f6df1b0eb..93e53d304c 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp @@ -346,37 +346,37 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, #elif 0 GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn_lds_double_buffer #endif - , - InBlockCopyDataPerRead, - WeiBlockCopyDataPerRead, - OutThreadCopyDataPerWrite>{}; + , + InBlockCopyDataPerRead, + WeiBlockCopyDataPerRead, + OutThreadCopyDataPerWrite>{}; float time = launch_kernel(run_gridwise_convolution, dim3(GridSize), diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 411e46f83f..f4d95dfccb 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -381,7 +381,8 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) constexpr auto I7 = Number<7>{}; constexpr auto I8 = Number<8>{}; - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u %u}\n", + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u " + "%u}\n", s, desc.GetDimension(), desc.GetLength(I0), @@ -416,7 +417,8 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) constexpr auto I8 = Number<8>{}; constexpr auto I9 = Number<9>{}; - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u %u %u}\n", + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u " + "%u %u %u}\n", s, desc.GetDimension(), desc.GetLength(I0), diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index 6586ea6250..8e26c9b7ca 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -577,8 +577,8 @@ struct Blockwise4dTensorCopy3 iloop_d3 * thread_per_d3 * DataPerRead); *(reinterpret_cast(&p_dst[dst_offset + mDstMyThreadOffset])) = - *(reinterpret_cast(&p_src[src_offset + - mSrcMyThreadOffset])); + *(reinterpret_cast( + &p_src[src_offset + mSrcMyThreadOffset])); } } } @@ -612,7 +612,8 @@ struct Blockwise4dTensorCopy3 return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2 * nloop_d3; } - __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, Float* __restrict__ p_clipboard) const + __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, + Float* __restrict__ p_clipboard) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -670,8 +671,8 @@ struct Blockwise4dTensorCopy3 iloop_d3 * thread_per_d3 * DataPerRead); *(reinterpret_cast(&p_clipboard[dst_offset])) = - *(reinterpret_cast(&p_src[src_offset + - mSrcMyThreadOffset])); + *(reinterpret_cast( + &p_src[src_offset + mSrcMyThreadOffset])); } } } diff --git a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp index 7f2bd71a49..8556281669 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp @@ -43,8 +43,9 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn Float* const __restrict__ p_out_global) const { // be careful of this assertion - static_assert(NPerThread <= NPerBlock && NPerBlock % NPerThread == 0, - "wrong! should satisfy: NPerThread <= NPerBlock && NPerBlock % NPerThread == 0"); + static_assert( + NPerThread <= NPerBlock && NPerBlock % NPerThread == 0, + "wrong! should satisfy: NPerThread <= NPerBlock && NPerBlock % NPerThread == 0"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -69,8 +70,9 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn constexpr index_t WiPerBlock = WoPerBlock + X - 1; // divide block work: [K, Ho, Wo, N] - static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 && Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, - "wrong! cannot evenly divide work for workgroup "); + static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 && + Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, + "wrong! cannot evenly divide work for workgroup "); constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock; constexpr index_t HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock; @@ -101,8 +103,7 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, - Number{}); + Sequence{}, Number{}); constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -280,17 +281,8 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( + Sequence{}); constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( Sequence{}); diff --git a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index a13671fa08..5b42c638e1 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -43,8 +43,9 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn_lds_double_buffer Float* const __restrict__ p_out_global) const { // be careful of this assertion - static_assert(NPerThread <= NPerBlock && NPerBlock % NPerThread == 0, - "wrong! should satisfy: NPerThread <= NPerBlock && NPerBlock % NPerThread == 0"); + static_assert( + NPerThread <= NPerBlock && NPerBlock % NPerThread == 0, + "wrong! should satisfy: NPerThread <= NPerBlock && NPerBlock % NPerThread == 0"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -72,8 +73,9 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn_lds_double_buffer static_assert(C % (2 * CPerBlock) == 0, "C cannot be evenly divided"); // divide block work: [K, Ho, Wo, N] - static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 && Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, - "wrong! cannot evenly divide work for workgroup "); + static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 && + Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, + "wrong! cannot evenly divide work for workgroup "); constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock; constexpr index_t HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock; @@ -104,8 +106,7 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn_lds_double_buffer mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, - Number{}); + Sequence{}, Number{}); constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -250,16 +251,15 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn_lds_double_buffer blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, p_wei_register_clipboard); - // a series of batched GEMM for(index_t y = 0; y < Y; ++y) { for(index_t x = 0; x < X; ++x) { - blockwise_batch_gemm.Run(p_wei_block_now + - wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_now + in_chwn_block_desc.Get1dIndex(0, y, x, 0), - p_out_thread); + blockwise_batch_gemm.Run( + p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_now + in_chwn_block_desc.Get1dIndex(0, y, x, 0), + p_out_thread); } } @@ -291,10 +291,10 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { - blockwise_batch_gemm.Run(p_wei_block_double + - wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_double + in_chwn_block_desc.Get1dIndex(0, y, x, 0), - p_out_thread); + blockwise_batch_gemm.Run( + p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_double + in_chwn_block_desc.Get1dIndex(0, y, x, 0), + p_out_thread); } } @@ -376,17 +376,8 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn_lds_double_buffer constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( + Sequence{}); constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( Sequence{}); diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp index b32a19de87..0db7368071 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp @@ -43,8 +43,9 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn Float* const __restrict__ p_out_global) const { // be careful of this assertion - static_assert(NPerThread <= NPerBlock && NPerBlock % NPerThread == 0, - "wrong! should satisfy: NPerThread <= NPerBlock && NPerBlock % NPerThread == 0"); + static_assert( + NPerThread <= NPerBlock && NPerBlock % NPerThread == 0, + "wrong! should satisfy: NPerThread <= NPerBlock && NPerBlock % NPerThread == 0"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -69,8 +70,9 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn constexpr index_t WiPerBlock = WoPerBlock + X - 1; // divide block work: [K, Ho, Wo, N] - static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 && Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, - "wrong! cannot evenly divide work for workgroup "); + static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 && + Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, + "wrong! cannot evenly divide work for workgroup "); constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock; constexpr index_t HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock; @@ -93,7 +95,8 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn const index_t wi_block_data_begin = wo_block_data_begin; // 2d tensor view of gridwise weight - constexpr auto wei_ck_global_desc = make_ConstantTensorDescriptor(Sequence{}, Sequence{}); + constexpr auto wei_ck_global_desc = + make_ConstantTensorDescriptor(Sequence{}, Sequence{}); // tensor view of blockwise input and weight in LDS // be careful of alignment @@ -124,7 +127,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn // blockwise wei copy // format is [CPerBlock, KPerBlock] const auto blockwise_wei_copy = -#if 0//debug +#if 0 // debug Blockwise2dTensorCopy1{}; #endif - // a series of blockwise batched GEMM - // C_matrix += transpose(A_matrix) * B_matrix - // A_matrix and B_matrix saved in LDS, C_matrix saved in register - // A_matrix[C,K] is a sub-matrix of wei_block[C,K] - // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] - // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N] - constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); + // a series of blockwise batched GEMM + // C_matrix += transpose(A_matrix) * B_matrix + // A_matrix and B_matrix saved in LDS, C_matrix saved in register + // A_matrix[C,K] is a sub-matrix of wei_block[C,K] + // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] + // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N] + constexpr auto a_cxk_block_mtx_desc = + make_ConstantMatrixDescriptor(Number{}, + Number{}, + Number{}); constexpr auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(Number{}, @@ -180,7 +185,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn GemmDataPerReadB>{}; // LDS: be careful of alignment - constexpr index_t in_block_space = in_chwn_block_desc.GetElementSpace(Number{}); + constexpr index_t in_block_space = in_chwn_block_desc.GetElementSpace(Number{}); constexpr index_t wei_block_space = wei_ck_block_desc.GetElementSpace(Number{}); __shared__ Float p_in_block[in_block_space]; @@ -227,8 +232,8 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn { // weight: global mem to LDS blockwise_wei_copy.Run(p_wei_global_block_offset + - wei_cyxk_global_desc.Get1dIndex(0, y, x, 0), - p_wei_block); + wei_cyxk_global_desc.Get1dIndex(0, y, x, 0), + p_wei_block); __syncthreads(); @@ -297,17 +302,8 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( + Sequence{}); constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( Sequence{}); diff --git a/src/include/threadwise_gemm.hip.hpp b/src/include/threadwise_gemm.hip.hpp index fea45f30a9..34c9cbc430 100644 --- a/src/include/threadwise_gemm.hip.hpp +++ b/src/include/threadwise_gemm.hip.hpp @@ -1,6 +1,11 @@ #pragma once -template +template __device__ void threadwise_matrix_copy(SrcMatrix, const Float* __restrict__ p_src, DstMatrix, @@ -22,7 +27,7 @@ __device__ void threadwise_matrix_copy(SrcMatrix, const index_t src_index = src_mtx.Get1dIndex(i, j); const index_t dst_index = dst_mtx.Get1dIndex(i, j); - *reinterpret_cast(&p_dst[dst_index]) = + *reinterpret_cast(&p_dst[dst_index]) = *reinterpret_cast(&p_src[src_index]); } }