From 00899f191b1ffbe24bdaf982d8ebf6a4e29697c3 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 13 Apr 2019 11:19:17 -0500 Subject: [PATCH] implicit gemm v1r2: only load 1d filter --- ...icit_gemm_convolution_1_chwn_cyxk_khwn.hpp | 48 ++++- driver/driver.hip.cpp | 43 +++-- src/include/ConstantTensorDescriptor.hip.hpp | 32 ++++ src/include/blockwise_2d_tensor_op.hip.hpp | 101 +++++++++- src/include/blockwise_3d_tensor_op.hip.hpp | 103 ++++++++++ src/include/blockwise_4d_tensor_op.hip.hpp | 18 +- src/include/blockwise_batched_gemm.hip.hpp | 11 ++ .../blockwise_direct_convolution.hip.hpp | 4 +- src/include/blockwise_gemm.hip.hpp | 2 + ...implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp} | 14 +- ..._chwn_cyxk_khwn_lds_double_buffer.hip.hpp} | 2 +- ..._implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp | 176 +++++++++--------- ...irect_convolution_2_nchw_kcyx_nkhw.hip.hpp | 2 +- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 2 +- src/include/threadwise_2d_tensor_op.hip.hpp | 4 +- src/include/threadwise_4d_tensor_op.hip.hpp | 4 +- .../threadwise_direct_convolution.hip.hpp | 2 +- 17 files changed, 426 insertions(+), 142 deletions(-) create mode 100644 src/include/blockwise_3d_tensor_op.hip.hpp rename src/include/{gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp => gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp} (96%) rename src/include/{gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp => gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp} (99%) diff --git a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp index decf294ab4..8727f7a315 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp @@ -2,8 +2,8 @@ #include #include "device.hpp" #include "gridwise_convolution_wrapper.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" +#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp" +#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" #include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp" template @@ -78,7 +78,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, out_khwn_device_buf.ToDevice(out_khwn.mData.data()); #if 0 - // for 3x3, 34x34, Pascal + // for 3x3, 34x34, v1r1, Pascal constexpr index_t NPerBlock = 16; constexpr index_t KPerBlock = 64; constexpr index_t CPerBlock = 4; @@ -112,6 +112,40 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, constexpr index_t BlockSize = 128; #elif 1 + // for 3x3, 34x34, v1r2, Pascal + constexpr index_t NPerBlock = 4; + constexpr index_t KPerBlock = 64; + constexpr index_t CPerBlock = 8; + constexpr index_t HoPerBlock = 4; + constexpr index_t WoPerBlock = 8; + + constexpr index_t NPerThread = 4; + constexpr index_t KPerThread = 8; + constexpr index_t HoPerThread = 1; + constexpr index_t WoPerThread = 2; + + constexpr index_t InBlockCopy_ThreadPerDimC = 4; + constexpr index_t InBlockCopy_ThreadPerDimH = 4; + constexpr index_t InBlockCopy_ThreadPerDimW = 2; + constexpr index_t InBlockCopy_ThreadPerDimN = 1; + constexpr index_t InBlockCopyDataPerRead = 4; + + constexpr index_t WeiBlockCopyDataPerRead = 4; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 2; + constexpr index_t GemmMLevel1Cluster = 2; + constexpr index_t GemmNLevel1Cluster = 2; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + constexpr index_t OutThreadCopyDataPerWrite = 2; + + constexpr index_t BlockSize = 128; +#elif 0 // for 3x3, 34x34, Vega 20 constexpr index_t NPerBlock = 16; constexpr index_t KPerBlock = 128; @@ -406,12 +440,12 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, for(index_t i = 0; i < nrepeat; ++i) { constexpr auto gridwise_conv = -#if 1 - GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn +#if 0 + GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn +#elif 0 + GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn_lds_double_buffer #elif 1 GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn -#elif 0 - GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn_lds_double_buffer #endif + double operator()(Is... is) + { + std::array dims = {{static_cast(is)...}}; + +#if 0 + auto f_acc = std::plus{}; +#else + auto f_acc = [](auto a, auto b){ return 10*a + b;}; +#endif + + return std::accumulate(dims.begin(), + dims.end(), + index_t(0), + f_acc); + } +}; + struct GeneratorTensor_Checkboard { template @@ -398,18 +421,7 @@ void check_error(const Tensor& ref, const Tensor& result) int main(int argc, char* argv[]) { -#if 0 - constexpr index_t N = 1; - constexpr index_t C = 1; - constexpr index_t HI = 28; - constexpr index_t WI = 28; - constexpr index_t K = 1; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 1 +#if 1 // 3x3, 34x34 constexpr index_t N = 64; constexpr index_t C = 256; @@ -656,13 +668,10 @@ int main(int argc, char* argv[]) #if 0 in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); -#elif 0 - in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); #elif 1 in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); -#elif 0 +#elif 1 in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); auto gen_wei = [](auto... is) { @@ -681,7 +690,7 @@ int main(int argc, char* argv[]) device_direct_convolution_2_vectorized_nchw_kcyx_nkhw #elif 1 device_implicit_gemm_convolution_1_chwn_cyxk_khwn -#elif 1 +#elif 0 device_implicit_gemm_convolution_2_chwn_cyxk_khwn #endif (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index f4d95dfccb..c8b621d384 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -8,6 +8,13 @@ __host__ __device__ constexpr auto calculate_default_strides(Sequence) return Sequence{}; } +// this is ugly, only for 3d +template +__host__ __device__ constexpr auto calculate_default_strides(Sequence) +{ + return Sequence{}; +} + // this is ugly, only for 4d template __host__ __device__ constexpr auto calculate_default_strides(Sequence) @@ -79,6 +86,15 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence{}; } +// this is ugly, only for 3d +template +__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence, + Number) +{ + constexpr index_t L2_align = Align * ((L2 + Align - 1) / Align); + return Sequence{}; +} + // this is ugly, only for 4d template __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence, @@ -244,6 +260,22 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) desc.GetStride(I0), desc.GetStride(I1)); } + else if(ndim == 3) + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + printf("%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", + s, + desc.GetDimension(), + desc.GetLength(I0), + desc.GetLength(I1), + desc.GetLength(I2), + desc.GetStride(I0), + desc.GetStride(I1), + desc.GetStride(I2)); + } else if(ndim == 4) { constexpr auto I0 = Number<0>{}; diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 141cb2602f..6521ddc0a5 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -14,7 +14,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths()); #if 0 - if(threadIdx.x == 0) + if(get_thread_local_1d_id() == 0) { print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: "); print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: "); @@ -25,7 +25,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst for(index_t iloop = 0; iloop < NLoop; ++iloop) { - index_t is = threadIdx.x + iloop * BlockSize; + index_t is = get_thread_local_1d_id() + iloop * BlockSize; const index_t did0 = is / desc.GetStride(I0); @@ -42,7 +42,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst if(has_tail) { - index_t is = threadIdx.x + NLoop * BlockSize; + index_t is = get_thread_local_1d_id() + NLoop * BlockSize; if(is < desc.GetElementSize()) { @@ -59,7 +59,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst } } -// Function: p_dst[reorder[i0], reorder[i1], reorder[i2], reorder[i3]] = p_src[i0,i1,i2,i3] +// Function: p_dst[reorder[i0], reorder[i1] = p_src[i0,i1] // TODO: in order to optimize mem access for different mem type, // need to write specialized version template +template struct Blockwise2dTensorCopy1 { + using vector_t = typename vector_type::MemoryType; + + __device__ constexpr Blockwise2dTensorCopy1() + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1), + "wrong! only support stride1 == 1 if DataPerRead > 1!\n"); + + static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, + "wrong! only support DataPerRead == 1, 2 or 4!\n"); + + static_assert(SrcDesc{}.GetStride(I0) % DataPerRead == 0 && + DstDesc{}.GetStride(I0) % DataPerRead == 0, + "src and dst stride2 should be multiple of DataPerRead to keep alignment"); + + // we allow out-of-bound read from src in D1 dimension, + // but we need to make sure dst stride0 is big enough, + // so that the out-of-bound write won't contaminate next line in dst + constexpr index_t L1 = CopyLengths{}.Get(I1); + constexpr index_t read_per_d1 = integer_divide_ceil(L1, DataPerRead); + + static_assert(read_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0), + "wrong! out-of-bound write will contaminate next line!\n"); + } + __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - constexpr auto dst_from_src_reorder = Sequence<0, 1>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; - blockwise_2d_tensor_copy_reorder_by_get_dst_from_src( - SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder); + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + constexpr index_t L0 = CopyLengths{}.Get(I0); + constexpr index_t L1 = CopyLengths{}.Get(I1); + + constexpr index_t read_per_d1 = integer_divide_ceil(L1, DataPerRead); + + constexpr auto ref_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; + + auto f_copy = [&](index_t is) { + index_t did[4]; + + did[0] = is / ref_desc.GetStride(I0); + + is -= did[0] * ref_desc.GetStride(I0); + + did[1] = is / ref_desc.GetStride(I1); + + const index_t src_index = + src_desc.Get1dIndex(did[0], did[1] * DataPerRead); + const index_t dst_index = + dst_desc.Get1dIndex(did[0], did[1] * DataPerRead); + + *(reinterpret_cast(p_dst + dst_index)) = + *(reinterpret_cast(p_src + src_index)); + }; + + for(index_t iloop = 0; iloop < NLoop; ++iloop) + { + index_t is = get_thread_local_1d_id() + iloop * BlockSize; + + f_copy(is); + } + + constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); + + if(has_tail) + { + index_t is = get_thread_local_1d_id() + NLoop * BlockSize; + + if(is < ref_desc.GetElementSize()) + { + f_copy(is); + } + } } }; diff --git a/src/include/blockwise_3d_tensor_op.hip.hpp b/src/include/blockwise_3d_tensor_op.hip.hpp new file mode 100644 index 0000000000..c7a85431b2 --- /dev/null +++ b/src/include/blockwise_3d_tensor_op.hip.hpp @@ -0,0 +1,103 @@ +#pragma once +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" + +template +struct Blockwise3dTensorCopy1 +{ + using vector_t = typename vector_type::MemoryType; + + __device__ constexpr Blockwise3dTensorCopy1() + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I2) == 1 && DstDesc{}.GetStride(I2) == 1), + "wrong! only support stride2 == 1 if DataPerRead > 1!\n"); + + static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, + "wrong! only support DataPerRead == 1, 2 or 4!\n"); + + static_assert(SrcDesc{}.GetStride(I1) % DataPerRead == 0 && + DstDesc{}.GetStride(I1) % DataPerRead == 0, + "src and dst stride1 should be multiple of DataPerRead to keep alignment"); + + // we allow out-of-bound read from src in D3 dimension, + // but we need to make sure dst stride2 is big enough, + // so that the out-of-bound write won't contaminate next line in dst + constexpr index_t L2 = CopyLengths{}.Get(I2); + constexpr index_t read_per_d2 = integer_divide_ceil(L2, DataPerRead); + + static_assert(read_per_d2 * DataPerRead <= DstDesc{}.GetStride(I1), + "wrong! out-of-bound write will contaminate next line!\n"); + } + + __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + constexpr index_t L0 = CopyLengths{}.Get(I0); + constexpr index_t L1 = CopyLengths{}.Get(I1); + constexpr index_t L2 = CopyLengths{}.Get(I2); + + constexpr index_t read_per_d2 = integer_divide_ceil(L2, DataPerRead); + + constexpr auto ref_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; + + auto f_copy = [&](index_t is) { + index_t did[3]; + + did[0] = is / ref_desc.GetStride(I0); + + is -= did[0] * ref_desc.GetStride(I0); + + did[1] = is / ref_desc.GetStride(I1); + + is -= did[1] * ref_desc.GetStride(I1); + + did[2] = is / ref_desc.GetStride(I2); + + const index_t src_index = + src_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead); + const index_t dst_index = + dst_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead); + + *(reinterpret_cast(p_dst + dst_index)) = + *(reinterpret_cast(p_src + src_index)); + }; + + for(index_t iloop = 0; iloop < NLoop; ++iloop) + { + index_t is = get_thread_local_1d_id() + iloop * BlockSize; + + f_copy(is); + } + + constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); + + if(has_tail) + { + index_t is = get_thread_local_1d_id() + NLoop * BlockSize; + + if(is < ref_desc.GetElementSize()) + { + f_copy(is); + } + } + } +}; diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index 444190e2a7..45de7823b5 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -15,7 +15,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths()); #if 0 - if(threadIdx.x == 0) + if(get_thread_local_1d_id() == 0) { print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: "); print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: "); @@ -26,7 +26,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst for(index_t iloop = 0; iloop < NLoop; ++iloop) { - index_t is = threadIdx.x + iloop * BlockSize; + index_t is = get_thread_local_1d_id() + iloop * BlockSize; const index_t did0 = is / desc.GetStride(I0); @@ -51,7 +51,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst if(has_tail) { - index_t is = threadIdx.x + NLoop * BlockSize; + index_t is = get_thread_local_1d_id() + NLoop * BlockSize; if(is < desc.GetElementSize()) { @@ -113,7 +113,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds for(index_t iloop = 0; iloop < NLoop; ++iloop) { - index_t is = threadIdx.x + iloop * BlockSize; + index_t is = get_thread_local_1d_id() + iloop * BlockSize; index_t did[4]; @@ -142,7 +142,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds if(has_tail) { - index_t is = threadIdx.x + NLoop * BlockSize; + index_t is = get_thread_local_1d_id() + NLoop * BlockSize; if(is < ref_desc.GetElementSize()) { @@ -287,7 +287,7 @@ struct Blockwise4dTensorCopy1 for(index_t iloop = 0; iloop < NLoop; ++iloop) { - index_t is = threadIdx.x + iloop * BlockSize; + index_t is = get_thread_local_1d_id() + iloop * BlockSize; f_copy(is); } @@ -296,7 +296,7 @@ struct Blockwise4dTensorCopy1 if(has_tail) { - index_t is = threadIdx.x + NLoop * BlockSize; + index_t is = get_thread_local_1d_id() + NLoop * BlockSize; if(is < ref_desc.GetElementSize()) { @@ -370,7 +370,7 @@ struct BlockwiseChwnTensorCopyPadded for(index_t iloop = 0; iloop < NLoop; ++iloop) { - index_t is = threadIdx.x + iloop * BlockSize; + index_t is = get_thread_local_1d_id() + iloop * BlockSize; index_t did[4]; @@ -401,7 +401,7 @@ struct BlockwiseChwnTensorCopyPadded if(has_tail) { - index_t is = threadIdx.x + NLoop * BlockSize; + index_t is = get_thread_local_1d_id() + NLoop * BlockSize; if(is < ref_desc.GetElementSize()) { diff --git a/src/include/blockwise_batched_gemm.hip.hpp b/src/include/blockwise_batched_gemm.hip.hpp index 6919325f98..bdaab2e90a 100644 --- a/src/include/blockwise_batched_gemm.hip.hpp +++ b/src/include/blockwise_batched_gemm.hip.hpp @@ -250,6 +250,15 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 } } +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + printf("a: %f %f %f %f %f %f %f %f, b: %f %f %f %f %f %f %f %f\n", + p_a_thread[0], p_a_thread[1], p_a_thread[2], p_a_thread[3], p_a_thread[4], p_a_thread[5], p_a_thread[6], p_a_thread[7], + p_b_thread[0], p_b_thread[1], p_b_thread[2], p_b_thread[3], p_b_thread[4], p_b_thread[5], p_b_thread[6], p_b_thread[7]); + } +#endif + threadwise_gemm(a_thread_mtx, True, p_a_thread, @@ -313,6 +322,8 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 MPerThread == 8 && NPerThread == 8, "Run_asm cannot deal with this GEMM shape yet\n"); + static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n"); + static_assert( BlockMatrixStrideA == 0 && BatchPerThread == 1, "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n"); diff --git a/src/include/blockwise_direct_convolution.hip.hpp b/src/include/blockwise_direct_convolution.hip.hpp index 3aff3b7936..c79833f17d 100644 --- a/src/include/blockwise_direct_convolution.hip.hpp +++ b/src/include/blockwise_direct_convolution.hip.hpp @@ -42,7 +42,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc, constexpr index_t XThreadWork = (out_block_desc.GetLength(I3) + WoPerThread - 1) / WoPerThread; #if 0 - if(threadIdx.x == 0) + if(get_thread_local_1d_id() == 0) { print_ConstantTensorDescriptor(in_block_desc); print_ConstantTensorDescriptor(wei_block_desc); @@ -68,7 +68,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc, constexpr auto out_thread_block_desc = make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_block_desc.GetStrides()); - const index_t thread_id = threadIdx.x; + const index_t thread_id = get_thread_local_1d_id(); for(index_t thread_work_id = thread_id; thread_work_id < NThreadWork * KThreadWork * YThreadWork * XThreadWork; diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index ad4e9d2cdf..4870227f7e 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -176,6 +176,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 MPerThread == 8 && NPerThread == 8, "Run_asm cannot deal with this GEMM shape yet\n"); + static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n"); + using Float4 = vector_type::MemoryType; Float4* reg_a = (Float4*)(p_a_thread); diff --git a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp similarity index 96% rename from src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp rename to src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp index eab42ed07b..67f1bca25d 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp @@ -36,7 +36,7 @@ template -struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn +struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn { __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, @@ -100,7 +100,7 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn // tensor view of blockwise input and weight in LDS // be careful of alignment constexpr index_t max_align = - mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); + mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead, GemmDataPerReadA, GemmDataPerReadB); constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -118,6 +118,14 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn // blockwise copy // input: format is [C, Hi, Wi, N] const auto blockwise_in_copy = +#if 1 + Blockwise4dTensorCopy1{}; +#else Blockwise4dTensorCopy3{}; +#endif + // blockwise wei copy // format is [CPerBlock*Y*X,KPerBlock] diff --git a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp similarity index 99% rename from src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp rename to src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index 5b42c638e1..365697ecfd 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -36,7 +36,7 @@ template -struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn_lds_double_buffer +struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn_lds_double_buffer { __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp index 0db7368071..fa710b3845 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp @@ -2,8 +2,9 @@ #include "common.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp" #include "ConstantMatrixDescriptor.hip.hpp" -#include "blockwise_4d_tensor_op.hip.hpp" #include "blockwise_2d_tensor_op.hip.hpp" +#include "blockwise_3d_tensor_op.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" #include "threadwise_nd_tensor_op.hip.hpp" #include "threadwise_4d_tensor_op.hip.hpp" #include "blockwise_batched_gemm.hip.hpp" @@ -52,19 +53,19 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_chwn_global_desc = InGlobalDesc{}; - constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{}; - constexpr auto out_khwn_global_desc = OutGlobalDesc{}; + constexpr auto in_c_h_w_n_global_desc = InGlobalDesc{}; + constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{}; + constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{}; - constexpr index_t C = in_chwn_global_desc.GetLength(I0); + constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0); - constexpr index_t K = out_khwn_global_desc.GetLength(I0); - constexpr index_t Ho = out_khwn_global_desc.GetLength(I1); - constexpr index_t Wo = out_khwn_global_desc.GetLength(I2); - constexpr index_t N = out_khwn_global_desc.GetLength(I3); + constexpr index_t K = out_k_h_w_n_global_desc.GetLength(I0); + constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1); + constexpr index_t Wo = out_k_h_w_n_global_desc.GetLength(I2); + constexpr index_t N = out_k_h_w_n_global_desc.GetLength(I3); - constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1); - constexpr index_t X = wei_cyxk_global_desc.GetLength(I2); + constexpr index_t Y = wei_c_y_x_k_global_desc.GetLength(I1); + constexpr index_t X = wei_c_y_x_k_global_desc.GetLength(I2); constexpr index_t HiPerBlock = HoPerBlock + Y - 1; constexpr index_t WiPerBlock = WoPerBlock + X - 1; @@ -94,84 +95,85 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn const index_t hi_block_data_begin = ho_block_data_begin; const index_t wi_block_data_begin = wo_block_data_begin; - // 2d tensor view of gridwise weight - constexpr auto wei_ck_global_desc = - make_ConstantTensorDescriptor(Sequence{}, Sequence{}); + // global tensor view + constexpr auto wei_c_x_k_global_desc = + make_ConstantTensorDescriptor(Sequence{}, Sequence{}); - // tensor view of blockwise input and weight in LDS + // LDS tensor view // be careful of alignment constexpr index_t max_align = - mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); + mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead, GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); + constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); - constexpr auto wei_ck_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); + constexpr auto wei_c_x_k_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_khwn_thread_desc = make_ConstantTensorDescriptor( + constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( Sequence{}); // blockwise copy // input: format is [C, Hi, Wi, N] const auto blockwise_in_copy = +#if 1 + Blockwise4dTensorCopy1{}; +#else Blockwise4dTensorCopy3{}; - - // blockwise wei copy - // format is [CPerBlock, KPerBlock] - const auto blockwise_wei_copy = -#if 0 // debug - Blockwise2dTensorCopy1{}; -#else - Blockwise2dTensorCopy3{}; #endif - // a series of blockwise batched GEMM - // C_matrix += transpose(A_matrix) * B_matrix - // A_matrix and B_matrix saved in LDS, C_matrix saved in register - // A_matrix[C,K] is a sub-matrix of wei_block[C,K] - // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] - // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N] - constexpr auto a_cxk_block_mtx_desc = - make_ConstantMatrixDescriptor(Number{}, - Number{}, - Number{}); + // blockwise wei copy + // format is [CPerBlock, X * KPerBlock] + const auto blockwise_wei_copy = + Blockwise3dTensorCopy1{}; - constexpr auto b_cxwn_block_mtx_desc = + // a series of blockwise batched GEMM + // C_matrix += transpose(A_matrix) * B_matrix + // A_matrix and B_matrix saved in LDS, C_matrix saved in register + // A_matrix[C,K] is a sub-matrix of wei_block[C,K] + // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] + // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N] + constexpr auto a_c_k_block_mtx_desc = + make_ConstantMatrixDescriptor(Number{}, + Number{}, + Number{}); + + constexpr auto b_c_wn_block_mtx_desc = make_ConstantMatrixDescriptor(Number{}, Number{}, - Number{}); + Number{}); - constexpr auto c_kxwn_thread_mtx_desc = + constexpr auto c_k_wn_thread_mtx_desc = make_ConstantMatrixDescriptor(Number{}, Number{}, - Number{}); + Number{}); const auto blockwise_batch_gemm = BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2< BlockSize, - decltype(a_cxk_block_mtx_desc), - decltype(b_cxwn_block_mtx_desc), - decltype(c_kxwn_thread_mtx_desc), + decltype(a_c_k_block_mtx_desc), + decltype(b_c_wn_block_mtx_desc), + decltype(c_k_wn_thread_mtx_desc), 0, - in_chwn_block_desc.GetStride(I1), - out_khwn_thread_desc.GetStride(I1), + in_c_h_w_n_block_desc.GetStride(I1), + out_k_h_w_n_thread_desc.GetStride(I1), HoPerBlock, GemmMPerThreadSubC, GemmNPerThreadSubC, @@ -185,64 +187,64 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn GemmDataPerReadB>{}; // LDS: be careful of alignment - constexpr index_t in_block_space = in_chwn_block_desc.GetElementSpace(Number{}); - constexpr index_t wei_block_space = wei_ck_block_desc.GetElementSpace(Number{}); + constexpr index_t in_block_space = in_c_h_w_n_block_desc.GetElementSpace(Number{}); + constexpr index_t wei_block_space = wei_c_x_k_block_desc.GetElementSpace(Number{}); __shared__ Float p_in_block[in_block_space]; __shared__ Float p_wei_block[wei_block_space]; // register - Float p_out_thread[out_khwn_thread_desc.GetElementSpace()]; + Float p_out_thread[out_k_h_w_n_thread_desc.GetElementSpace()]; #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) { - print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc"); - print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc"); - print_ConstantTensorDescriptor(wei_ck_global_desc, "wei_ck_global_desc"); + print_ConstantTensorDescriptor(in_c_h_w_n_global_desc, "in_c_h_w_n_global_desc"); + print_ConstantTensorDescriptor(wei_c_y_x_k_global_desc, "wei_c_y_x_k_global_desc"); - print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc"); - print_ConstantTensorDescriptor(wei_ck_block_desc, "wei_ck_block_desc"); + print_ConstantTensorDescriptor(in_c_h_w_n_block_desc, "in_c_h_w_n_block_desc"); + print_ConstantTensorDescriptor(wei_c_x_k_block_desc, "wei_c_x_k_block_desc"); printf("in_block_space %u, wei_block_space %u\n", in_block_space, wei_block_space); } #endif // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread); + threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); const Float* p_in_global_block_offset = p_in_global + - in_chwn_global_desc.Get1dIndex( + in_c_h_w_n_global_desc.Get1dIndex( 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, - p_in_global_block_offset += CPerBlock * in_chwn_global_desc.GetStride(I0), - p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0)) + p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0), + p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) { - // input: global mem to LDS - blockwise_in_copy.Run(p_in_global_block_offset, p_in_block); - for(index_t y = 0; y < Y; ++y) { + blockwise_in_copy.Run(p_in_global_block_offset + + in_c_h_w_n_global_desc.Get1dIndex(0, y, 0, 0), + p_in_block); + + blockwise_wei_copy.Run(p_wei_global_block_offset + + wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, 0), + p_wei_block); + + __syncthreads(); + for(index_t x = 0; x < X; ++x) { - // weight: global mem to LDS - blockwise_wei_copy.Run(p_wei_global_block_offset + - wei_cyxk_global_desc.Get1dIndex(0, y, x, 0), - p_wei_block); - - __syncthreads(); - - blockwise_batch_gemm.Run(p_wei_block, - p_in_block + in_chwn_block_desc.Get1dIndex(0, y, x, 0), + blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0), + p_in_block + in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0), p_out_thread); - __syncthreads(); } + + __syncthreads(); } } @@ -324,7 +326,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn p_out_thread, out_10d_global_desc, p_out_global + - out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, ho_block_data_begin + ho_thread_data_begin, wo_block_data_begin + wo_thread_data_begin, n_block_data_begin + n_thread_data_begin), diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index b301fc1e52..cbebe28f17 100644 --- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -125,7 +125,7 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i constexpr index_t HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread; constexpr index_t WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread; - const index_t thread_id = threadIdx.x; + const index_t thread_id = get_thread_local_1d_id(); itmp = thread_id; const index_t n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork); diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 250253f2ff..0b83eccc3c 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -137,7 +137,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr index_t HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread; constexpr index_t WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread; - const index_t thread_id = threadIdx.x; + const index_t thread_id = get_thread_local_1d_id(); itmp = thread_id; const index_t n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork); diff --git a/src/include/threadwise_2d_tensor_op.hip.hpp b/src/include/threadwise_2d_tensor_op.hip.hpp index 6e25b61b73..f8b8f722e3 100644 --- a/src/include/threadwise_2d_tensor_op.hip.hpp +++ b/src/include/threadwise_2d_tensor_op.hip.hpp @@ -10,7 +10,7 @@ __device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __re constexpr auto desc = Desc{}; #if 0 - if(threadIdx.x == 0) + if(get_thread_local_1d_id() == 0) { print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_op_unary: "); } @@ -112,7 +112,7 @@ __device__ void threadwise_2d_tensor_shift_down(Desc, Float* __restrict__ p, IDi constexpr auto desc = Desc{}; #if 0 - if(threadIdx.x == 0) + if(get_thread_local_1d_id() == 0) { print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_shift_down: "); } diff --git a/src/include/threadwise_4d_tensor_op.hip.hpp b/src/include/threadwise_4d_tensor_op.hip.hpp index 19ab68d013..21fed4d286 100644 --- a/src/include/threadwise_4d_tensor_op.hip.hpp +++ b/src/include/threadwise_4d_tensor_op.hip.hpp @@ -12,7 +12,7 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re constexpr auto desc = Desc{}; #if 0 - if(threadIdx.x == 0) + if(get_thread_local_1d_id() == 0) { print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_op_unary: "); } @@ -218,7 +218,7 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi constexpr auto desc = Desc{}; #if 0 - if(threadIdx.x == 0) + if(get_thread_local_1d_id() == 0) { print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_shift_down: "); } diff --git a/src/include/threadwise_direct_convolution.hip.hpp b/src/include/threadwise_direct_convolution.hip.hpp index 9b90c402a1..1c033573b9 100644 --- a/src/include/threadwise_direct_convolution.hip.hpp +++ b/src/include/threadwise_direct_convolution.hip.hpp @@ -20,7 +20,7 @@ __device__ void threadwise_direct_convolution_1(InDesc, constexpr auto out_desc = OutDesc{}; #if 0 - if(blockIdx.x == 0 && threadIdx.x == 0) + if(blockIdx.x == 0 && get_thread_local_1d_id() == 0) { print_ConstantTensorDescriptor(in_desc, "threadwise_direct_convolution: in_desc: "); print_ConstantTensorDescriptor(wei_desc, "threadwise_direct_convolution: wei_desc: ");