diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index b25459c30d..d8bb3b768e 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -51,7 +51,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); #elif 1 - in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w), + in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w), in_nchw(n, 4 * c + 1, h, w), in_nchw(n, 4 * c + 2, h, w), in_nchw(n, 4 * c + 3, h, w)); @@ -113,37 +113,37 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned BlockSize = 128; #elif 0 // 3x3, 34x34, 128 thread, fp32, vector = 2 - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 2; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 2; constexpr unsigned HoPerBlock = 2; constexpr unsigned WoPerBlock = 32; - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 1; + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 1; constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; - constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; #elif 0 // 3x3, 34x34, 128 thread, int8, vector = 4 - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 8; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 8; constexpr unsigned HoPerBlock = 4; constexpr unsigned WoPerBlock = 32; - constexpr unsigned NPerThread = 1; - constexpr unsigned KPerThread = 8; - constexpr unsigned CPerThread = 2; + constexpr unsigned NPerThread = 1; + constexpr unsigned KPerThread = 8; + constexpr unsigned CPerThread = 2; constexpr unsigned HoPerThread = 4; constexpr unsigned WoPerThread = 2; - constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; diff --git a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp index 2c27080670..99a6eb45c1 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp @@ -74,7 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); out_khwn_device_buf.ToDevice(out_khwn.mData.data()); -#if 1 +#if 0 // for 3x3, 34x34 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 64; @@ -213,7 +213,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, constexpr unsigned WoPerThread = 1; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // for 1x1, 28x28 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 128; @@ -245,6 +245,39 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, constexpr unsigned OutThreadCopyDataPerWrite = 2; + constexpr unsigned BlockSize = 128; +#elif 1 + // for 1x1, 14x14 + constexpr unsigned NPerBlock = 16; + constexpr unsigned KPerBlock = 128; + constexpr unsigned CPerBlock = 8; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 2; + + constexpr unsigned NPerThread = 4; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 1; + constexpr unsigned HoPerThread = 1; + constexpr unsigned WoPerThread = 1; + + constexpr unsigned InBlockCopy_ThreadPerDimC = 8; + constexpr unsigned InBlockCopy_ThreadPerDimH = 2; + constexpr unsigned InBlockCopy_ThreadPerDimW = 2; + constexpr unsigned InBlockCopy_ThreadPerDimN = 4; + constexpr unsigned InBlockCopyDataPerRead = 4; + + constexpr unsigned WeiBlockCopyDataPerRead = 4; + + constexpr unsigned GemmMPerThreadSubC = 4; + constexpr unsigned GemmNPerThreadSubC = 4; + constexpr unsigned GemmMLevel0Cluster = 4; + constexpr unsigned GemmNLevel0Cluster = 2; + constexpr unsigned GemmMLevel1Cluster = 2; + constexpr unsigned GemmNLevel1Cluster = 4; + constexpr unsigned GemmKPerThreadLoop = 1; + + constexpr unsigned OutThreadCopyDataPerWrite = 2; + constexpr unsigned BlockSize = 128; #endif diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index b1df58265e..d391ec5b5f 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -8,11 +8,11 @@ #include "ConstantTensorDescriptor.hip.hpp" #include "conv_common.hip.hpp" //#include "device_direct_convolution_1.hpp" -#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" -#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" -//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" +//#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" +//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" +#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" -//#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" +#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" struct GeneratorTensor_1 { @@ -353,7 +353,7 @@ void host_winograd_3x3_convolution(const Tensor& in_nchw, std::size_t ho = HoPerTile * htile + j; for(int i = 0; i < WoPerTile; ++i) { - std::size_t wo = WoPerTile * wtile + i; + std::size_t wo = WoPerTile * wtile + i; out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); } } @@ -568,7 +568,7 @@ int main(int argc, char* argv[]) constexpr unsigned HPad = 2; constexpr unsigned WPad = 2; -#elif 1 +#elif 0 // 1x1 filter, 32x32 image constexpr unsigned N = 64; constexpr unsigned C = 256; @@ -578,6 +578,18 @@ int main(int argc, char* argv[]) constexpr unsigned Y = 1; constexpr unsigned X = 1; + constexpr unsigned HPad = 0; + constexpr unsigned WPad = 0; +#elif 1 + // 1x1 filter, 14x14 image + constexpr unsigned N = 128; + constexpr unsigned C = 2048; + constexpr unsigned HI = 14; + constexpr unsigned WI = 14; + constexpr unsigned K = 512; + constexpr unsigned Y = 1; + constexpr unsigned X = 1; + constexpr unsigned HPad = 0; constexpr unsigned WPad = 0; #endif @@ -594,8 +606,8 @@ int main(int argc, char* argv[]) ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); - using in_data_t = char; - using out_data_t = int32_t; + using in_data_t = float; + using out_data_t = float; Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); @@ -635,9 +647,9 @@ int main(int argc, char* argv[]) device_direct_convolution_1 #elif 0 device_direct_convolution_2_nchw_kcyx_nkhw -#elif 1 - device_direct_convolution_2_vectorized_nchw_kcyx_nkhw #elif 0 + device_direct_convolution_2_vectorized_nchw_kcyx_nkhw +#elif 1 device_implicit_gemm_convolution_1_chwn_cyxk_khwn #elif 0 device_implicit_gemm_convolution_2_chwn_cyxk_khwn diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 1caab6a4c9..89654cbc2b 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -10,7 +10,7 @@ struct Array unsigned mData[nSize]; template - __host__ __device__ Array(Xs... xs) : mData({static_cast(xs)...}) + __host__ __device__ Array(Xs... xs) : mData{static_cast(xs)...} { } diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index fa5f36be51..0660c34ebb 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -340,10 +340,11 @@ struct BlockwiseChwnTensorCopyPadded constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; const Float* p_src_tmp = - p_src + src_desc.Get1dIndex(c_block_data_begin, - (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, - (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, - n_block_data_begin); + p_src + + src_desc.Get1dIndex(c_block_data_begin, + (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, + (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, + n_block_data_begin); #if 0 if(get_thread_local_1d_id() == 0) diff --git a/src/include/blockwise_direct_convolution.hip.hpp b/src/include/blockwise_direct_convolution.hip.hpp index 247ff219f6..7666607c9c 100644 --- a/src/include/blockwise_direct_convolution.hip.hpp +++ b/src/include/blockwise_direct_convolution.hip.hpp @@ -93,10 +93,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, Float p_out_thread[out_thread_desc.GetElementSpace()]; threadwise_4d_tensor_copy(out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc, p_out_thread, out_thread_desc.GetLengths()); @@ -107,10 +108,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, // threadwise convolution threadwise_direct_convolution_2( in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data_begin, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data_begin, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0), @@ -122,10 +124,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, threadwise_4d_tensor_copy(out_thread_desc, p_out_thread, out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc.GetLengths()); } } diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 33556dde25..221a7153a2 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -431,12 +431,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 constexpr unsigned MRepeat = MPerThread / MPerThreadSubC; constexpr unsigned NRepeat = NPerThread / NPerThreadSubC; - // loop over k +// loop over k #pragma unroll for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) { - // read first batch of A, B - // copy A-sub to form A +// read first batch of A, B +// copy A-sub to form A #pragma unroll for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { @@ -449,7 +449,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 a_thread_sub_mtx.GetLengths()); } - // copy B-sub to form B +// copy B-sub to form B #pragma unroll for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { @@ -462,7 +462,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 b_thread_sub_mtx.GetLengths()); } - // loop over batch +// loop over batch #pragma unroll for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib) { @@ -551,14 +551,15 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 c_thread_mtx_begin.batch * BlockMatrixStrideC + c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col); - for(unsigned m_repeat = 0; m_repeat, MRepeat; ++m_repeat) + for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { - for(unsigned n_repeat = 0; n_repeat, NRepeat; ++n_repeat) + for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { threadwise_matrix_copy( c_thread_sub_mtx, - p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, - n_repeat * NPerLevel1Cluster), + p_c_thread + + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, + n_repeat * NPerLevel1Cluster), c_block_mtx, p_c_block + c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, @@ -656,8 +657,9 @@ struct BlockwiseGemmBlockABlockBThreadC constexpr unsigned NClusterWork = (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster); - static_assert(BlockSize == (MClusterWork * MThreadPerCluster) * - (NClusterWork * NThreadPerCluster), + static_assert(BlockSize == + (MClusterWork * MThreadPerCluster) * + (NClusterWork * NThreadPerCluster), "wrong! wrong BlockSize"); if(DistributeThreadAlongColumnFirst) @@ -1256,8 +1258,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), c_thread_sub_mtx, False, - p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, - n_repeat * NPerThreadSubC), + p_c_thread + + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, + n_repeat * NPerThreadSubC), f_accum); } } diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index d5832dde9d..ba0a521fb3 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -21,6 +21,7 @@ struct is_same static const bool value = true; }; +#if 0 template __host__ __device__ constexpr T max(T a, T b) { @@ -32,6 +33,7 @@ __host__ __device__ constexpr T min(T a, T b) { return a < b ? a : b; } +#endif __host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b) { diff --git a/src/include/config.h.in b/src/include/config.h.in index 7b888c6951..bb4f6cb51d 100644 --- a/src/include/config.h.in +++ b/src/include/config.h.in @@ -4,8 +4,10 @@ #if DEVICE_BACKEND_HIP #include "hip/hip_runtime.h" +#include "hip/hip_fp16.h" #elif DEVICE_BACKEND_CUDA #include "cuda_runtime.h" +#include "cuda_fp16.h" #include "nvToolsExt.h" #include "helper_cuda.h" #endif diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index ca1f4dcbae..95d5b0b33f 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -1,15 +1,6 @@ #pragma once #include "config.h" -#if DEVICE_BACKEND_CUDA -namespace CUDA { -#include "cuda_fp16.h" -} -#endif - -using half = CUDA::half; -using half2 = CUDA::half2; - template struct vector_type { @@ -52,6 +43,7 @@ struct vector_type using MemoryType = float4; }; +#if 0 template <> struct vector_type { @@ -91,24 +83,6 @@ struct vector_type using MemoryType = float4; }; -template <> -struct vector_type -{ - using MemoryType = half2; -}; - -template <> -struct vector_type -{ - using MemoryType = float2; -}; - -template <> -struct vector_type -{ - using MemoryType = float4; -}; - template <> struct vector_type { @@ -169,7 +143,6 @@ struct vector_type using MemoryType = int64_t; }; -#if 0 template <> struct vector_type { @@ -214,6 +187,7 @@ __device__ void fused_multiply_accumulate(float& d, const float4& s0, const floa d += s0.w * s1.w; } +#if 0 __device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; } __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1) @@ -222,12 +196,10 @@ __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& d += s0.y * s1.y; } -#if 0 __device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1) { d += s0.x * s1.x + s0.y * s1.y; } -#endif __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; } @@ -239,3 +211,4 @@ __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const i d = __dp4a(s0, s1, d); #endif } +#endif diff --git a/src/include/gridwise_direct_convolution_1.hip.hpp b/src/include/gridwise_direct_convolution_1.hip.hpp index f4fe1809fc..edcfd6d38e 100644 --- a/src/include/gridwise_direct_convolution_1.hip.hpp +++ b/src/include/gridwise_direct_convolution_1.hip.hpp @@ -113,10 +113,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ c_block_work_begin += CPerBlock) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, - c_block_work_begin, - hi_block_work_begin, - wi_block_work_begin), + blockwise_in_copy.Run(p_in_global + + in_global_desc.Get1dIndex(n_block_work_begin, + c_block_work_begin, + hi_block_work_begin, + wi_block_work_begin), p_in_block); // copy weight tensor to LDS @@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ } // copy output tensor from LDS to device mem - blockwise_out_copy.Run(p_out_block, - p_out_global + out_global_desc.Get1dIndex(n_block_work_begin, - k_block_work_begin, - ho_block_work_begin, - wo_block_work_begin)); + blockwise_out_copy.Run( + p_out_block, + p_out_global + + out_global_desc.Get1dIndex( + n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin)); } diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index 5761a22c16..1e6d3d24bd 100644 --- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -176,16 +176,18 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( - k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); + blockwise_wei_copy.Run( + p_wei_global + + wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); __syncthreads(); @@ -195,10 +197,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i #if 1 threadwise_direct_convolution_2( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -207,10 +210,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i #elif 0 threadwise_direct_convolution_3( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -225,9 +229,10 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 7ae594788b..4d72368b29 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -200,9 +200,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( p_in_vec_block); // copy weight tensor to LDS - blockwise_wei_copy.Run(p_wei_vec_global + wei_kcyx_vec_global_desc.Get1dIndex( - k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_vec_block); + blockwise_wei_copy.Run( + p_wei_vec_global + + wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_vec_block); __syncthreads(); @@ -212,10 +213,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #if 1 threadwise_direct_convolution_2( in_nchw_vec_thread_block_desc, - p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_vec_block + + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -224,10 +226,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #elif 0 threadwise_direct_convolution_3( in_nchw_vec_thread_block_desc, - p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_vec_block + + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -242,9 +245,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp index 1caef669e9..99342d3ca1 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp @@ -184,8 +184,9 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread); const Float* p_in_global_block_begin = - p_in_global + in_chwn_global_desc.Get1dIndex( - 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); + p_in_global + + in_chwn_global_desc.Get1dIndex( + 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_begin = p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); @@ -216,7 +217,7 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric } } - // output: register to global mem, +// output: register to global mem, #if 0 const auto c_thread_mtx_begin = blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); @@ -286,16 +287,17 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric } #endif - threadwise_8d_tensor_copy(out_8d_thread_desc, - p_out_thread, - out_8d_global_desc, - p_out_global + out_khwn_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_8d_thread_desc.GetLengths(), - Number{}); + threadwise_8d_tensor_copy( + out_8d_thread_desc, + p_out_thread, + out_8d_global_desc, + p_out_global + + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_8d_thread_desc.GetLengths(), + Number{}); } else if(NPerThread == NPerBlock) { diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp index a4904cdf58..790a006023 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp @@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded( out_hkwn_thread_desc, p_out_thread, out_khwn_global_desc, - p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), + p_out_global + + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), out_hkwn_thread_desc.GetLengths(), reorder_khwn_from_hkwn); } diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp index afa3d3ee90..f68b57b6a0 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp @@ -121,7 +121,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric decltype(in_cb_block_desc), decltype(in_cb_block_desc.GetLengths())>{}; #elif 0 - const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = Blockwise2dTensorCopy3{}; #elif 0 - const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = Blockwise2dTensorCopy3) { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::MemoryType; static_assert(SrcDesc{}.GetDimension() == 6 && DstDesc{}.GetDimension() == 6 && SrcOpLengths::nDim == 6, @@ -80,7 +80,7 @@ __device__ void threadwise_8d_tensor_copy(SrcDesc, SrcOpLengths, Number) { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::MemoryType; static_assert(SrcDesc{}.GetDimension() == 8 && DstDesc{}.GetDimension() == 8 && SrcOpLengths::nDim == 8,