From 79d9b1084b8f65fe6c261483276b791aeb918627 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Mar 2019 18:16:02 -0500 Subject: [PATCH] adding fp16 direct that reads pre-vectorized data --- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 49 +++++++--- src/include/blockwise_2d_tensor_op.hip.hpp | 2 +- src/include/blockwise_4d_tensor_op.hip.hpp | 4 +- src/include/common.hip.hpp | 92 +------------------ src/include/config.h.in | 2 - src/include/functional.hip.hpp | 8 ++ ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 46 ++++++---- .../threadwise_direct_convolution.hip.hpp | 55 ++++------- 8 files changed, 92 insertions(+), 166 deletions(-) diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index 19633f0462..d16d05d978 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -13,8 +13,8 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, unsigned nrepeat) { constexpr unsigned NVector = 1; - using vector_type_t = vector_type; - using vector_t = typename vector_type_t::VectorType; + using vector_t = vector_type; + using vector_mem_t = typename vector_t::MemoryType; constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -41,40 +41,41 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence{}); ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: "); - Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); + Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) { #if 1 in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w); #else in_nchw_vec(n, c, h, w) = - vector_type_t::pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); + vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); #endif }; - make_ParallelTensorFunctor(f_vectorized_nchw, N, C, Hi, Wi)( + make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)( std::thread::hardware_concurrency()); // vectorize weight auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence{}); ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: "); - Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); + Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) { #if 1 wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x); #else wei_kcyx_vec(k, c, y, x) = - vector_type_t::pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); + vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); #endif }; - make_ParallelTensorFunctor(f_vectorized_kcyx, K, C, Y, X)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)( + std::thread::hardware_concurrency()); // - DeviceMem in_nchw_vec_device_buf(sizeof(vector_t) * in_nchw_vec.mDesc.GetElementSpace()); - DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_t) * wei_kcyx_vec.mDesc.GetElementSpace()); + DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace()); + DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace()); DeviceMem out_nkhw_device_buf(sizeof(T) * out_nkhw.mDesc.GetElementSpace()); in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data()); @@ -82,7 +83,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); #if 1 - // 3x3, 34x34, 128 thread + // 3x3, 34x34, 128 thread, fp32, vector = 1 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; constexpr unsigned CPerBlock = 4; @@ -96,24 +97,42 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; - constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; #elif 1 - // 3x3, 34x34, 128 thread, fp16 + // 3x3, 34x34, 128 thread, fp32, vector = 2 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 4; + constexpr unsigned CPerBlock = 2; constexpr unsigned HoPerBlock = 2; constexpr unsigned WoPerBlock = 32; constexpr unsigned NPerThread = 2; constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; + constexpr unsigned CPerThread = 1; constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 2; + + constexpr unsigned BlockSize = 128; +#elif 1 + // 3x3, 34x34, 128 thread, fp16 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 4; constexpr unsigned BlockSize = 128; diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 969f18c4e8..ce3a7a37b9 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -373,7 +373,7 @@ template struct Blockwise2dTensorCopy3 { - using vector_t = typename vector_type::VectorType; + using vector_t = typename vector_type::MemoryType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index e8829c0cbf..fa5f36be51 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -207,7 +207,7 @@ template struct Blockwise4dTensorCopy1 { - using vector_t = typename vector_type::VectorType; + using vector_t = typename vector_type::MemoryType; __device__ constexpr Blockwise4dTensorCopy1() { @@ -444,7 +444,7 @@ template struct Blockwise4dTensorCopy3 { - using vector_t = typename vector_type::VectorType; + using vector_t = typename vector_type::MemoryType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index aa7e2269f6..d5832dde9d 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -1,4 +1,5 @@ #pragma once +#include "data_type.hip.hpp" #include "constant_integral.hip.hpp" #include "Sequence.hip.hpp" #include "Array.hip.hpp" @@ -20,97 +21,6 @@ struct is_same static const bool value = true; }; -template -struct vector_type -{ -}; - -template <> -struct vector_type -{ - using VectorType = float; -}; - -template <> -struct vector_type -{ - using VectorType = float2; -}; - -template <> -struct vector_type -{ - using VectorType = float4; -}; - -#if 0 -template <> -struct vector_type -{ - using VectorType = half_float::half; -}; - -template <> -struct vector_type -{ - using VectorType = float; -}; - -template <> -struct vector_type -{ - using VectorType = float2; -}; - -template <> -struct vector_type -{ - using VectorType = float4; -}; -#endif - -#if 1 -template <> -struct vector_type -{ - using VectorType = half; - - __host__ __device__ static VectorType pack(half s) { return s; } -}; - -template <> -struct vector_type -{ - using VectorType = half2; - - union Data - { - VectorType vector; - half scalar[2]; - }; - - __host__ __device__ static VectorType pack(half s0, half s1) - { - Data data; - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using VectorType = float2; -}; - -template <> -struct vector_type -{ - using VectorType = float4; -}; -#endif - template __host__ __device__ constexpr T max(T a, T b) { diff --git a/src/include/config.h.in b/src/include/config.h.in index 9ee0c41f80..7b888c6951 100644 --- a/src/include/config.h.in +++ b/src/include/config.h.in @@ -4,10 +4,8 @@ #if DEVICE_BACKEND_HIP #include "hip/hip_runtime.h" -#include "half.hpp" #elif DEVICE_BACKEND_CUDA #include "cuda_runtime.h" #include "nvToolsExt.h" #include "helper_cuda.h" -#include "cuda_fp16.h" #endif diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp index 598d5c3c71..d3f645eaae 100644 --- a/src/include/functional.hip.hpp +++ b/src/include/functional.hip.hpp @@ -47,3 +47,11 @@ struct static_const_reduce_n<1> return f(Number<0>{}); } }; + +#if 0 +template +__host__ __device__ constexpr auto unpacker(F f) +{ + return [=](auto xs_array){ f(xs...); }; +} +#endif \ No newline at end of file diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 5901c42e55..825977ab54 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -27,12 +27,14 @@ template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const typename vector_type::VectorType* const __restrict__ p_in_vec_global, - const typename vector_type::VectorType* const __restrict__ p_wei_vec_global, + const typename vector_type::MemoryType* const __restrict__ p_in_vec_global, + const typename vector_type::MemoryType* const __restrict__ p_wei_vec_global, Float* const __restrict__ p_out_global) { - using scalar_t = Float; - using vector_t = typename vector_type::VectorType; + using scalar_t = Float; + using vector_mem_t = typename vector_type::MemoryType; constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -69,6 +71,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( // shared mem constexpr unsigned in_block_size = in_nchw_vec_block_desc.GetElementSpace(Number{}); + constexpr unsigned wei_block_size = wei_kcyx_vec_block_desc.GetElementSpace(Number{}); @@ -76,8 +79,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ? InBlockCopyDataPerRead : WeiBlockCopyDataPerRead; - __shared__ vector_t p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; - __shared__ vector_t p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; + __shared__ vector_mem_t + p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ vector_mem_t + p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors constexpr unsigned HiPerThread = HoPerThread + Y - 1; @@ -150,7 +155,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1 +template __device__ void threadwise_direct_convolution_1(InDesc, - Float* const __restrict__ p_in, + TInWei* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + TInWei* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + TOut* __restrict__ p_out) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -51,25 +51,10 @@ __device__ void threadwise_direct_convolution_1(InDesc, const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo); - p_out[out_index] += p_wei[wei_index] * p_in[in_index]; - -#if 0 - // if(threadIdx.x == 0) - { - printf("threadwise_direct_convolution: \t" - "threadIdx.x %u\t" - "out_index %u, p_out[out_index] %f, \t" - "wei_index %u, p_wei[wei_index] %f, \t" - "in_index %u, p_in[in_index] %f\n", - threadIdx.x, - out_index, - p_out[out_index], - wei_index, - p_wei[wei_index], - in_index, - p_in[in_index]); - } -#endif + fused_multiply_add(p_out[out_index], + p_wei[wei_index], + p_in[in_index], + p_out[out_index]); } } } @@ -81,13 +66,13 @@ __device__ void threadwise_direct_convolution_1(InDesc, // Optimized for scenario if p_in and p_wei are in LDS, p_out are in register // Copy in and wei into register before doing convolution -template +template __device__ void threadwise_direct_convolution_2(InDesc, - Float* const __restrict__ p_in, + TInWei* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + TInWei* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + TOut* __restrict__ p_out) { constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; @@ -97,8 +82,8 @@ __device__ void threadwise_direct_convolution_2(InDesc, constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths()); // register - Float p_in_reg[in_reg_desc.GetElementSpace()]; - Float p_wei_reg[wei_reg_desc.GetElementSpace()]; + TInWei p_in_reg[in_reg_desc.GetElementSpace()]; + TInWei p_wei_reg[wei_reg_desc.GetElementSpace()]; // copy input tensor into register threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc.GetLengths()); @@ -114,13 +99,13 @@ __device__ void threadwise_direct_convolution_2(InDesc, // optimized for scenario where p_in and p_wei are in LDS, p_out is in register // break down a non-1x1 convolution into a sequence of 1x1 convolutions, // load 1x1 weight into register, and do 1x1 convolution in register. -template +template __device__ void threadwise_direct_convolution_3(InDesc, - Float* const __restrict__ p_in, + Data* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + Data* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + Data* __restrict__ p_out) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -139,8 +124,8 @@ __device__ void threadwise_direct_convolution_3(InDesc, constexpr auto wei_reg_desc = make_ConstantTensorDescriptor( Sequence{}); - Float p_in_reg[in_reg_desc.GetElementSpace()]; - Float p_wei_reg[wei_reg_desc.GetElementSpace()]; + Data p_in_reg[in_reg_desc.GetElementSpace()]; + Data p_wei_reg[wei_reg_desc.GetElementSpace()]; constexpr unsigned in_w_new_read = 1;