diff --git a/driver/device_direct_convolution_2.hpp b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp similarity index 57% rename from driver/device_direct_convolution_2.hpp rename to driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp index 1baedafc46..4a49ff9fc8 100644 --- a/driver/device_direct_convolution_2.hpp +++ b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp @@ -1,16 +1,16 @@ #pragma once #include #include "device.hpp" -#include "gridwise_direct_convolution_2.hip.hpp" +#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" template -void device_direct_convolution_2(InDesc, - const Tensor& in, - WeiDesc, - const Tensor& wei, - OutDesc, - Tensor& out, - unsigned nrepeat) +void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, + const Tensor& in, + WeiDesc, + const Tensor& wei, + OutDesc, + Tensor& out, + unsigned nrepeat) { std::size_t data_sz = sizeof(T); DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); @@ -46,6 +46,27 @@ void device_direct_convolution_2(InDesc, constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + + constexpr unsigned BlockSize = 128; +#elif 1 + // 3x3, 34x34, 128 thread, fp16 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 128; #endif @@ -57,27 +78,30 @@ void device_direct_convolution_2(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - float time = launch_kernel(gridwise_direct_convolution_2, - dim3(GridSize), - dim3(BlockSize), - static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer())); + float time = + launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..1f3cee0482 --- /dev/null +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -0,0 +1,211 @@ +#pragma once +#include +#include "device.hpp" +#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" + +template +void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + unsigned nrepeat) +{ + // this suppose in / wei data type is int8x4 + constexpr unsigned NVector = 4; + using accum_t = int32_t; + using vector_t = vector_type; + using vector_mem_t = typename vector_t::MemoryType; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_nchw_desc = InDesc{}; + constexpr auto wei_kcyx_desc = WeiDesc{}; + constexpr auto 
out_nkhw_desc = OutDesc{}; + + constexpr unsigned Hi = in_nchw_desc.GetLength(I2); + constexpr unsigned Wi = in_nchw_desc.GetLength(I3); + + constexpr unsigned N = out_nkhw_desc.GetLength(I0); + constexpr unsigned Ho = out_nkhw_desc.GetLength(I2); + constexpr unsigned Wo = out_nkhw_desc.GetLength(I3); + + constexpr unsigned K = wei_kcyx_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_desc.GetLength(I3); + + // vectorized input + auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: "); + + Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); + + auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) { +#if 0 + in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w); +#elif 0 + in_nchw_vec(n, c, h, w) = + vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); +#elif 1 + in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w), + in_nchw(n, 4 * c + 1, h, w), + in_nchw(n, 4 * c + 2, h, w), + in_nchw(n, 4 * c + 3, h, w)); +#endif + }; + + make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)( + std::thread::hardware_concurrency()); + + // vectorize weight + auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: "); + + Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); + + auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) { +#if 0 + wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x); +#elif 0 + wei_kcyx_vec(k, c, y, x) = + vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); +#elif 1 + wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 4 * c, y, x), + wei_kcyx(k, 4 * c + 1, y, x), + wei_kcyx(k, 4 * c + 2, y, x), + wei_kcyx(k, 4 * c + 3, y, x)); +#endif + }; + + make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)( + std::thread::hardware_concurrency()); + + // + DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace()); + DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace()); + DeviceMem out_nkhw_device_buf(sizeof(TOut) * out_nkhw.mDesc.GetElementSpace()); + + in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data()); + wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data()); + out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); + +#if 0 + // 3x3, 34x34, 128 thread, fp32, vector = 1 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 2; + + constexpr unsigned BlockSize = 128; +#elif 0 + // 3x3, 34x34, 128 thread, fp32, vector = 2 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 2; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 1; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + 
constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 2; + + constexpr unsigned BlockSize = 128; +#elif 0 + // 3x3, 34x34, 128 thread, int8, vector = 4 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 8; + constexpr unsigned HoPerBlock = 4; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 1; + constexpr unsigned KPerThread = 8; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 4; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 2; + + constexpr unsigned BlockSize = 128; +#elif 1 + // 1x1, 32x32, 128 thread, int8, vector = 4 + constexpr unsigned NPerBlock = 1; + constexpr unsigned KPerBlock = 64; + constexpr unsigned CPerBlock = 16; + constexpr unsigned HoPerBlock = 4; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 1; + constexpr unsigned KPerThread = 8; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 4; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 2; + + constexpr unsigned BlockSize = 128; +#endif + + constexpr unsigned GridSize = + (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + for(unsigned i = 0; i < nrepeat; ++i) + { + float time = launch_kernel( + gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_nchw_vec_device_buf.GetDeviceBuffer()), + static_cast(wei_kcyx_vec_device_buf.GetDeviceBuffer()), + static_cast(out_nkhw_device_buf.GetDeviceBuffer())); + + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); + } + + out_nkhw_device_buf.FromDevice(out_nkhw.mData.data()); +} diff --git a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp index 2c27080670..99a6eb45c1 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp @@ -74,7 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); out_khwn_device_buf.ToDevice(out_khwn.mData.data()); -#if 1 +#if 0 // for 3x3, 34x34 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 64; @@ -213,7 +213,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, constexpr unsigned WoPerThread = 1; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // for 1x1, 28x28 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 128; @@ -245,6 +245,39 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, constexpr unsigned OutThreadCopyDataPerWrite = 2; + constexpr unsigned BlockSize = 128; +#elif 1 + // for 1x1, 14x14 + constexpr unsigned NPerBlock = 16; + constexpr unsigned KPerBlock = 128; + constexpr unsigned CPerBlock = 8; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 2; + + constexpr unsigned NPerThread = 4; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 1; + constexpr unsigned HoPerThread = 1; + constexpr unsigned WoPerThread = 1; + + constexpr unsigned InBlockCopy_ThreadPerDimC = 8; + constexpr unsigned InBlockCopy_ThreadPerDimH = 2; + constexpr unsigned 
InBlockCopy_ThreadPerDimW = 2; + constexpr unsigned InBlockCopy_ThreadPerDimN = 4; + constexpr unsigned InBlockCopyDataPerRead = 4; + + constexpr unsigned WeiBlockCopyDataPerRead = 4; + + constexpr unsigned GemmMPerThreadSubC = 4; + constexpr unsigned GemmNPerThreadSubC = 4; + constexpr unsigned GemmMLevel0Cluster = 4; + constexpr unsigned GemmNLevel0Cluster = 2; + constexpr unsigned GemmMLevel1Cluster = 2; + constexpr unsigned GemmNLevel1Cluster = 4; + constexpr unsigned GemmKPerThreadLoop = 1; + + constexpr unsigned OutThreadCopyDataPerWrite = 2; + constexpr unsigned BlockSize = 128; #endif diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index a657949f35..3edd8253dd 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -1,6 +1,7 @@ #pragma once #include #include "device.hpp" +#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp" #include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" template @@ -209,39 +210,43 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - float time = - launch_kernel(gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer< - GridSize, - BlockSize, - T, - decltype(in_chwn_desc), - decltype(wei_cyxk_desc), - decltype(out_khwn_desc), - BPerBlock, - KPerBlock, - CPerBlock, - BPerThread, - KPerThread, - GemmThreadPerColumnPerCluster, - GemmThreadPerRowPerCluster, - GemmMPerThreadSubC, - GemmNPerThreadSubC, - GemmMLevel0Cluster, - GemmNLevel0Cluster, - GemmMLevel1Cluster, - GemmNLevel1Cluster, - GemmKPerThreadLoop, - InBlockCopyThreadPerDim0, - InBlockCopyThreadPerDim1, - WeiBlockCopyThreadPerDim0, - WeiBlockCopyThreadPerDim1, - InBlockCopyDataPerRead, - WeiBlockCopyDataPerRead>, - dim3(GridSize), - dim3(BlockSize), - static_cast(in_chwn_device_buf.GetDeviceBuffer()), - static_cast(wei_cyxk_device_buf.GetDeviceBuffer()), - static_cast(out_khwn_device_buf.GetDeviceBuffer())); + float time = launch_kernel( +#if 0 + gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn +#else + gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer +#endif + , + dim3(GridSize), + dim3(BlockSize), + static_cast(in_chwn_device_buf.GetDeviceBuffer()), + static_cast(wei_cyxk_device_buf.GetDeviceBuffer()), + static_cast(out_khwn_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 325da5d1dc..5f2e9d7c95 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -7,10 +7,11 @@ #include "tensor.hpp" #include "ConstantTensorDescriptor.hip.hpp" #include "conv_common.hip.hpp" -#include "device_direct_convolution_1.hpp" -#include "device_direct_convolution_2.hpp" +//#include "device_direct_convolution_1.hpp" +#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" +//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" #include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" -#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" +//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" #include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" struct GeneratorTensor_1 @@ -34,25 +35,6 @@ struct GeneratorTensor_2 } }; -struct GeneratorTensor_3 -{ - template - double operator()(Is... 
is) - { -#if 0 - std::initializer_list ls = {static_cast(is)...}; - return std::accumulate(ls.begin(), ls.end(), std::size_t(0)); -#elif 1 - assert(sizeof...(Is) > 0); - std::initializer_list ids = {static_cast(is)...}; - std::vector lens(sizeof...(Is), 100); - std::vector strides(sizeof...(Is), 1); - std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is)-1), strides.rbegin() + 1); - return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1; -#endif - } -}; - struct GeneratorTensor_Checkboard { template @@ -106,9 +88,12 @@ auto make_TensorDescriptor(TConstTensorDesc) return TensorDescriptor(lengths, strides); } -template -void host_direct_convolution( - const Tensor& in_nchw, const Tensor& wei_kcyx, Tensor& out, LowerPads, UpperPads) +template +void host_direct_convolution(const Tensor& in_nchw, + const Tensor& wei_kcyx, + Tensor& out_nkhw, + LowerPads, + UpperPads) { unsigned h_pad_low = LowerPads{}.Get(Number<0>{}); unsigned w_pad_low = LowerPads{}.Get(Number<1>{}); @@ -129,26 +114,29 @@ void host_direct_convolution( if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 && wi < in_nchw.mDesc.GetLengths()[3]) { - v += in_nchw(n, c, hi, wi) * wei_kcyx(k, c, y, x); + v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x)); } } } } - out(n, k, ho, wo) = v; + out_nkhw(n, k, ho, wo) = v; }; auto f_par = make_ParallelTensorFunctor(f, - out.mDesc.GetLengths()[0], - out.mDesc.GetLengths()[1], - out.mDesc.GetLengths()[2], - out.mDesc.GetLengths()[3]); + out_nkhw.mDesc.GetLengths()[0], + out_nkhw.mDesc.GetLengths()[1], + out_nkhw.mDesc.GetLengths()[2], + out_nkhw.mDesc.GetLengths()[3]); f_par(std::thread::hardware_concurrency()); } -template -void host_winograd_3x3_convolution( - const Tensor& in_nchw, const Tensor& wei_kcyx, Tensor& out, LowerPads, UpperPads) +template +void host_winograd_3x3_convolution(const Tensor& in_nchw, + const Tensor& wei_kcyx, + Tensor& out_nkhw, + LowerPads, + UpperPads) { constexpr std::size_t HoPerTile = 2; constexpr std::size_t WoPerTile = 2; @@ -162,8 +150,8 @@ void host_winograd_3x3_convolution( std::size_t Y = wei_kcyx.mDesc.GetLengths()[2]; std::size_t X = wei_kcyx.mDesc.GetLengths()[3]; - std::size_t HO = out.mDesc.GetLengths()[2]; - std::size_t WO = out.mDesc.GetLengths()[3]; + std::size_t HO = out_nkhw.mDesc.GetLengths()[2]; + std::size_t WO = out_nkhw.mDesc.GetLengths()[3]; unsigned h_pad_low = LowerPads{}.Get(Number<0>{}); unsigned w_pad_low = LowerPads{}.Get(Number<1>{}); @@ -177,11 +165,11 @@ void host_winograd_3x3_convolution( std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile; std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile; - Tensor in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); - Tensor in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); - Tensor wei_transform({K, C, HiPerTile, WiPerTile}); - Tensor out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile}); - Tensor out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile}); + Tensor in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); + Tensor in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); + Tensor wei_transform({K, C, HiPerTile, WiPerTile}); + Tensor out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile}); + Tensor out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile}); auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) { for(int j = 0; j < HiPerTile; ++j) @@ -198,7 +186,7 @@ void host_winograd_3x3_convolution( } else { - in_hold(n, c, htile, wtile, j, i) = T(0); + in_hold(n, c, htile, wtile, j, i) = TIn(0); } } } 
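For orientation, the weight-transform hunk that follows is the Winograd F(2x2, 3x3) filter transform u = G g G^T with

    G = [ 1    0    0   ]
        [ 1/2  1/2  1/2 ]
        [ 1/2 -1/2  1/2 ]
        [ 0    0    1   ]

which is where the +/-0.5 and +/-0.25 coefficient pattern in f_wei_transform comes from. A minimal standalone sketch of the same arithmetic (illustration only; the helper name is hypothetical and not part of this patch):

// computes u = G * g * G^T for one 3x3 filter
void winograd_f2x2_3x3_weight_transform(const double g[3][3], double u[4][4])
{
    // 4x3 Winograd F(2,3) filter-transform matrix G
    const double G[4][3] = {
        {1.0, 0.0, 0.0}, {0.5, 0.5, 0.5}, {0.5, -0.5, 0.5}, {0.0, 0.0, 1.0}};

    // t = G * g (4x3)
    double t[4][3];
    for(int i = 0; i < 4; ++i)
        for(int j = 0; j < 3; ++j)
            t[i][j] = G[i][0] * g[0][j] + G[i][1] * g[1][j] + G[i][2] * g[2][j];

    // u = t * G^T (4x4); e.g. u[1][1] = 0.25 * (sum of all nine taps),
    // matching wei_transform(k, c, 1, 1) in the hunk below
    for(int i = 0; i < 4; ++i)
        for(int j = 0; j < 4; ++j)
            u[i][j] = t[i][0] * G[j][0] + t[i][1] * G[j][1] + t[i][2] * G[j][2];
}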
@@ -259,49 +247,61 @@ void host_winograd_3x3_convolution( }; auto f_wei_transform = [&](auto k, auto c) { - wei_transform(k, c, 0, 0) = wei_kcyx(k, c, 0, 0); - wei_transform(k, c, 0, 1) = - 0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2); - wei_transform(k, c, 0, 2) = - 0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2); - wei_transform(k, c, 0, 3) = wei_kcyx(k, c, 0, 2); + wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0)); + wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) + + 0.5 * double(wei_kcyx(k, c, 0, 1)) + + 0.5 * double(wei_kcyx(k, c, 0, 2)); + wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) - + 0.5 * double(wei_kcyx(k, c, 0, 1)) + + 0.5 * double(wei_kcyx(k, c, 0, 2)); + wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2)); - wei_transform(k, c, 1, 0) = - 0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 1, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) + - 0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 1, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) - - 0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 1, 3) = - 0.5 * wei_kcyx(k, c, 0, 2) + 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) + + 0.5 * double(wei_kcyx(k, c, 1, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 1, 1) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) + + 0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 1, 2) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) - + 0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) + + 0.5 * double(wei_kcyx(k, c, 1, 2)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); - wei_transform(k, c, 2, 0) = - 0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 2, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) - - 0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 2, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) + - 0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 2, 3) = - 0.5 * wei_kcyx(k, c, 0, 2) - 0.5 * 
wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) - + 0.5 * double(wei_kcyx(k, c, 1, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 2, 1) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) - + 0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 2, 2) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) + + 0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) - + 0.5 * double(wei_kcyx(k, c, 1, 2)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); - wei_transform(k, c, 3, 0) = wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 3, 1) = - 0.5 * wei_kcyx(k, c, 2, 0) + 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 3, 2) = - 0.5 * wei_kcyx(k, c, 2, 0) - 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 3, 3) = wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 1)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) - + 0.5 * double(wei_kcyx(k, c, 2, 1)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2)); }; auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) { @@ -354,7 +354,7 @@ void host_winograd_3x3_convolution( for(int i = 0; i < WoPerTile; ++i) { std::size_t wo = WoPerTile * wtile + i; - out(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); + out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); } } }; @@ -372,20 +372,25 @@ void host_winograd_3x3_convolution( template void check_error(const Tensor& ref, const Tensor& result) { + // printf("\n"); + float error = 0; float max_diff = -1; float ref_value = 0, result_value = 0; for(int i = 0; i < ref.mData.size(); ++i) { - error += std::abs(ref.mData[i] - result.mData[i]); - float diff = std::abs(ref.mData[i] - result.mData[i]); + error += std::abs(double(ref.mData[i]) - double(result.mData[i])); + float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); if(max_diff < diff) { max_diff = diff; ref_value = ref.mData[i]; result_value = result.mData[i]; } + + // printf("{%f, %f}", double(ref.mData[i]), double(result.mData[i])); } + // printf("\n"); std::cout << "error: " << error << std::endl; std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl; @@ -404,7 +409,7 @@ int main(int argc, char* argv[]) constexpr unsigned HPad = 0; constexpr unsigned WPad = 0; -#elif 1 +#elif 0 // 3x3, 34x34 constexpr unsigned N = 64; constexpr unsigned C = 256; @@ -563,6 +568,30 @@ int main(int argc, char* argv[]) constexpr unsigned HPad = 2; constexpr unsigned WPad = 2; +#elif 0 + // 1x1 filter, 32x32 image + constexpr unsigned N = 64; + constexpr unsigned C = 256; + constexpr unsigned HI = 32; + constexpr unsigned WI = 32; + constexpr unsigned K = 512; + constexpr 
unsigned Y = 1; + constexpr unsigned X = 1; + + constexpr unsigned HPad = 0; + constexpr unsigned WPad = 0; +#elif 1 + // 1x1 filter, 14x14 image + constexpr unsigned N = 128; + constexpr unsigned C = 2048; + constexpr unsigned HI = 14; + constexpr unsigned WI = 14; + constexpr unsigned K = 512; + constexpr unsigned Y = 1; + constexpr unsigned X = 1; + + constexpr unsigned HPad = 0; + constexpr unsigned WPad = 0; #endif auto lower_pads = Sequence{}; @@ -577,10 +606,12 @@ int main(int argc, char* argv[]) ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); - Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); - Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); - Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); - Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); + using in_data_t = float; + using out_data_t = float; + Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); + Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); + Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); + Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); std::size_t num_thread = std::thread::hardware_concurrency(); @@ -601,9 +632,13 @@ int main(int argc, char* argv[]) #elif 1 in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); -#elif 1 - in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); +#elif 0 + in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...); + }; + wei_kcyx.GenerateTensorValue(gen_wei, num_thread); #endif } @@ -611,7 +646,9 @@ int main(int argc, char* argv[]) #if 0 device_direct_convolution_1 #elif 0 - device_direct_convolution_2 + device_direct_convolution_2_nchw_kcyx_nkhw +#elif 0 + device_direct_convolution_2_vectorized_nchw_kcyx_nkhw #elif 1 device_implicit_gemm_convolution_1_chwn_cyxk_khwn #elif 0 @@ -633,7 +670,6 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 1 if(Y == 3 && X == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); @@ -643,7 +679,6 @@ int main(int argc, char* argv[]) host_direct_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); } check_error(out_nkhw_host, out_nkhw_device); -#endif #if 0 LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp new file mode 100644 index 0000000000..89654cbc2b --- /dev/null +++ b/src/include/Array.hip.hpp @@ -0,0 +1,18 @@ +#pragma once + +template +struct Array +{ + using Type = Array; + + static constexpr unsigned nSize = NSize; + + unsigned mData[nSize]; + + template + __host__ __device__ Array(Xs... 
xs) : mData{static_cast(xs)...} + { + } + + __host__ __device__ TData operator[](unsigned i) const { return mData[i]; } +}; diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 2352b0f50c..2e5d237e81 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -65,8 +65,8 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence struct ConstantTensorDescriptor { + using Type = ConstantTensorDescriptor; static constexpr unsigned nDim = Lengths::nDim; - using NDimConstant = Number; __host__ __device__ constexpr ConstantTensorDescriptor() { @@ -91,293 +91,70 @@ struct ConstantTensorDescriptor return Strides{}.Get(Number{}); } + // c++14 doesn't support constexpr lambdas, has to use this trick instead + struct GetElementSize_f + { + template + __host__ __device__ constexpr unsigned operator()(IDim idim) const + { + return Type{}.GetLength(idim); + } + }; + __host__ __device__ constexpr unsigned GetElementSize() const { - static_assert(nDim >= 2 && nDim <= 8, "nDim"); - - if(nDim == 2) + // c++14 doesn't support constexpr lambdas, has to use this trick instead + struct multiply { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; + __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const + { + return a * b; + } + }; - return GetLength(I0) * GetLength(I1); - } - else if(nDim == 3) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2); - } - else if(nDim == 4) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3); - } - else if(nDim == 5) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4); - } - else if(nDim == 6) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) * - GetLength(I5); - } - else if(nDim == 7) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) * - GetLength(I5) * GetLength(I6); - } - else if(nDim == 8) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) * - GetLength(I5) * GetLength(I6) * GetLength(I7); - } - else - { - assert(false); - } + return static_const_reduce_n{}(GetElementSize_f{}, multiply{}); } + // c++14 doesn't support constexpr lambdas, has to use this trick instead + 
struct GetElementSpace_f + { + template + __host__ __device__ constexpr unsigned operator()(IDim idim) const + { + return (Type{}.GetLength(idim) - 1) * Type{}.GetStride(idim); + } + }; + template > __host__ __device__ constexpr unsigned GetElementSpace(Align align = Align{}) const { - static_assert(nDim >= 2 && nDim <= 8, "nDim"); - - constexpr unsigned align_size = align.Get(); - - if(nDim == 2) + // c++14 doesn't support constexpr lambdas, has to use this trick instead + struct add { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; + __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const + { + return a + b; + } + }; - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - align_size; - } - else if(nDim == 3) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + align_size; - } - else if(nDim == 4) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - align_size; - } - else if(nDim == 5) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + align_size; - } - else if(nDim == 6) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) + - align_size; - } - else if(nDim == 7) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) + - (GetLength(I6) - 1) * GetStride(I6) + align_size; - } - else if(nDim == 8) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) + - (GetLength(I6) - 1) * GetStride(I6) + (GetLength(I7) - 1) * GetStride(I7) + - align_size; - } + return 
static_const_reduce_n{}(GetElementSpace_f{}, add{}) + align.Get(); } - // this is ugly, only for 2d - __host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1) const + template + __host__ __device__ unsigned Get1dIndex(Is... is) const { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; + static_assert(sizeof...(Is) == nDim, "number of multi-index is wrong"); - static_assert(nDim == 2, "nDim is not 2"); - return i0 * GetStride(I0) + i1 * GetStride(I1); - } + const auto multi_id = Array(is...); - // this is ugly, only for 3d - __host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1, unsigned i2) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; + unsigned id = 0; - static_assert(nDim == 3, "nDim is not 3"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2); - } + static_loop_n{}([&](auto IDim) { + constexpr unsigned idim = IDim.Get(); + id += multi_id[idim] * GetStride(IDim); + }); - // this is ugly, only for 4d - __host__ __device__ unsigned - Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - static_assert(nDim == 4, "nDim is not 4"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3); - } - - // this is ugly, only for 5d - __host__ __device__ unsigned - Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - - static_assert(nDim == 5, "nDim is not 5"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4); - } - - // this is ugly, only for 6d - __host__ __device__ unsigned - Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4, unsigned i5) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - - static_assert(nDim == 6, "nDim is not 6"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4) + i5 * GetStride(I5); - } - - // this is ugly, only for 7d - __host__ __device__ unsigned Get1dIndex(unsigned i0, - unsigned i1, - unsigned i2, - unsigned i3, - unsigned i4, - unsigned i5, - unsigned i6) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - static_assert(nDim == 7, "nDim is not 7"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6); - } - - // this is ugly, only for 8d - __host__ __device__ unsigned Get1dIndex(unsigned i0, - unsigned i1, - unsigned i2, - unsigned i3, - unsigned i4, - unsigned i5, - unsigned i6, - unsigned i7) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - 
constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - - static_assert(nDim == 8, "nDim is not 8"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6) + i7 * GetStride(I7); + return id; } __host__ __device__ constexpr auto Condense() const @@ -385,6 +162,12 @@ struct ConstantTensorDescriptor constexpr auto default_strides = calculate_default_strides(Lengths{}); return ConstantTensorDescriptor{}; } + + template + __host__ __device__ constexpr auto Vectorize(Number, Number) const + { + assert(false); // not implemented + } }; template diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp new file mode 100644 index 0000000000..c8ca7a0f24 --- /dev/null +++ b/src/include/Sequence.hip.hpp @@ -0,0 +1,92 @@ +#pragma once +#include "constant_integral.hip.hpp" +#include "functional.hip.hpp" + +template +struct Sequence +{ + using Type = Sequence; + + static constexpr unsigned nDim = sizeof...(Is); + + const unsigned mData[nDim] = {Is...}; + + template + __host__ __device__ constexpr unsigned Get(Number) const + { + return mData[I]; + } + + // this is ugly, only for nDIm = 4 + template + __host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence) const + { + static_assert(nDim == 4, "nDim != 4"); + + constexpr auto old_sequence = Type{}; + + constexpr unsigned NR0 = old_sequence.mData[I0]; + constexpr unsigned NR1 = old_sequence.mData[I1]; + constexpr unsigned NR2 = old_sequence.mData[I2]; + constexpr unsigned NR3 = old_sequence.mData[I3]; + + return Sequence{}; + } + + template + __host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence) const + { + // don't know how to implement this + printf("Sequence::ReorderByPutOldToNew not implemented"); + assert(false); + } + + template + __host__ __device__ constexpr auto PushBack(Number) const + { + return Sequence{}; + } + + __host__ __device__ constexpr auto PopBack() const; + + template + __host__ __device__ constexpr auto Transform(F f) const + { + return Sequence{}; + } +}; + +template +__host__ __device__ constexpr auto sequence_pop_back(Sequence) +{ + static_assert(sizeof...(Is) >= 1, "empty Sequence!"); + return Sequence{}; +} + +template +__host__ __device__ constexpr auto sequence_sequence_op(Sequence, Sequence, F f) +{ + static_assert(Sequence::nDim == Sequence::nDim, "Dim not the same"); + + return Sequence{}; +} + +template +__host__ __device__ constexpr auto sequence_sequence_add(Sequence, Sequence) +{ + struct add + { + __host__ __device__ constexpr unsigned operator()(unsigned x, unsigned y) const + { + return x + y; + } + }; + + return sequence_sequence_op(Sequence{}, Sequence{}, add{}); +} + +template +__host__ __device__ constexpr auto Sequence::PopBack() const +{ + return sequence_pop_back(Type{}); +} diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index b54c4d0c5f..ce3a7a37b9 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -373,7 +373,7 @@ template struct Blockwise2dTensorCopy3 { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::MemoryType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; @@ -383,8 +383,9 @@ struct Blockwise2dTensorCopy3 constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; - static_assert(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1, - "wrong! 
only support stride1 == 1!\n"); + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1), + "wrong! only support stride1 == 1 if DataPerRead > 1!\n"); static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, "wrong! only support DataPerRead == 1, 2 or 4!\n"); diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index 9087364b71..0660c34ebb 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -131,11 +131,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds did[3] = is / ref_desc.GetStride(I3); - const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); - const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); + const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); - f(p_src[aindex], p_dst[bindex]); + f(p_src[src_index], p_dst[dst_index]); } constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); @@ -162,11 +162,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds did[3] = is / ref_desc.GetStride(I3); - const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); - const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); + const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); - f(p_src[aindex], p_dst[bindex]); + f(p_src[src_index], p_dst[dst_index]); } } } @@ -199,15 +199,110 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy); } -template +template struct Blockwise4dTensorCopy1 { + using vector_t = typename vector_type::MemoryType; + + __device__ constexpr Blockwise4dTensorCopy1() + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1), + "wrong! only support stride3 == 1 if DataPerRead > 1!\n"); + + static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, + "wrong! only support DataPerRead == 1, 2 or 4!\n"); + + static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 && + DstDesc{}.GetStride(I2) % DataPerRead == 0, + "src and dst stride2 should be multiple of DataPerRead to keep alignment"); + + // we allow out-of-bound read from src in D3 dimension, + // but we need to make sure dst stride2 is big enough, + // so that the out-of-bound write won't contaminate next line in dst + constexpr unsigned L3 = CopyLengths{}.Get(I3); + constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead); + + static_assert(read_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2), + "wrong! 
out-of-bound write will contaminate next line!\n"); + } + __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; - blockwise_4d_tensor_copy_reorder_by_get_dst_from_src( - SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder); + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + constexpr unsigned L0 = CopyLengths{}.Get(I0); + constexpr unsigned L1 = CopyLengths{}.Get(I1); + constexpr unsigned L2 = CopyLengths{}.Get(I2); + constexpr unsigned L3 = CopyLengths{}.Get(I3); + + constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead); + + constexpr auto ref_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; + + auto f_copy = [&](unsigned is) { + unsigned did[4]; + + did[0] = is / ref_desc.GetStride(I0); + + is -= did[0] * ref_desc.GetStride(I0); + + did[1] = is / ref_desc.GetStride(I1); + + is -= did[1] * ref_desc.GetStride(I1); + + did[2] = is / ref_desc.GetStride(I2); + + is -= did[2] * ref_desc.GetStride(I2); + + did[3] = is / ref_desc.GetStride(I3); + + const unsigned src_index = + src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); + const unsigned dst_index = + dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); + + *(reinterpret_cast(p_dst + dst_index)) = + *(reinterpret_cast(p_src + src_index)); + }; + + for(unsigned iloop = 0; iloop < NLoop; ++iloop) + { + unsigned is = threadIdx.x + iloop * BlockSize; + + f_copy(is); + } + + constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); + + if(has_tail) + { + unsigned is = threadIdx.x + NLoop * BlockSize; + + if(is < ref_desc.GetElementSize()) + { + f_copy(is); + } + } } }; @@ -350,7 +445,7 @@ template struct Blockwise4dTensorCopy3 { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::MemoryType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; @@ -362,8 +457,9 @@ struct Blockwise4dTensorCopy3 constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1, - "wrong! only support stride3 == 1!\n"); + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1), + "wrong! only support stride3 == 1 if DataPerRead > 1!\n"); static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, "wrong! only support DataPerRead == 1, 2 or 4!\n"); @@ -371,7 +467,7 @@ struct Blockwise4dTensorCopy3 static_assert( SrcDesc{}.GetStride(I2) % DataPerRead == 0 && DstDesc{}.GetStride(I2) % DataPerRead == 0, - "wrong! src and dst stride should be multiple of DataPerRead to keep alignment"); + "wrong! 
src and dst stride2 should be multiple of DataPerRead to keep alignment");

     constexpr unsigned L0 = CopyLengths{}.Get(I0);
     constexpr unsigned L1 = CopyLengths{}.Get(I1);
diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp
index b44bb797b3..9a18ca5fd7 100644
--- a/src/include/blockwise_gemm.hip.hpp
+++ b/src/include/blockwise_gemm.hip.hpp
@@ -435,11 +435,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
 #pragma unroll
         for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
         {
-// read first batch of A, B
-// copy A-sub to form A
-#pragma unroll
+            // read first batch of A, B
+            // copy A-sub to form A
+            //#pragma unroll
             for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
             {
+#if 0
                 threadwise_matrix_copy(
                     a_block_mtx,
                     p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
@@ -447,12 +448,25 @@
                     a_thread_mtx,
                     p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
                     a_thread_sub_mtx.GetLengths());
+#else
+                for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
+                {
+                    for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
+                    {
+                        p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] =
+                            p_a_block[a_block_mtx.Get1dIndex(k_begin + i,
+                                                             m_repeat * MPerLevel1Cluster + j) +
+                                      mMyThreadOffsetA];
+                    }
+                }
+#endif
             }

-// copy B-sub to form B
-#pragma unroll
+            // copy B-sub to form B
+            //#pragma unroll
             for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
             {
+#if 0
                 threadwise_matrix_copy(
                     b_block_mtx,
                     p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
@@ -460,13 +474,26 @@
                     b_thread_mtx,
                     p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
                     b_thread_sub_mtx.GetLengths());
+#else
+                for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
+                {
+                    for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
+                    {
+                        p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] =
+                            p_b_block[b_block_mtx.Get1dIndex(k_begin + i,
+                                                             n_repeat * NPerLevel1Cluster + j) +
+                                      mMyThreadOffsetB];
+                    }
+                }
+#endif
             }

-// loop over batch
-#pragma unroll
+            // loop over batch
+            //#pragma unroll
             for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
             {
-                // do current batch of gemm
+// do current batch of gemm
+#if 0
                 threadwise_gemm(a_thread_mtx,
                                 True,
                                 p_a_thread,
@@ -477,13 +504,32 @@
                                 False,
                                 p_c_thread + ib * ThreadMatrixStrideC,
                                 f_accum);
+#else
+                for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
+                {
+                    for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
+                    {
+                        for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j)
+                        {
+                            const unsigned aindex =
+                                a_thread_mtx.Get1dIndex(k, i); // A is transposed
+                            const unsigned bindex = b_thread_mtx.Get1dIndex(k, j);
+                            const unsigned cindex =
+                                c_thread_mtx.Get1dIndex(i, j) + ib * ThreadMatrixStrideC;
+
+                            f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
+                        }
+                    }
+                }
+#endif

                 // read next batch of a, b
                 if(BlockMatrixStrideA != 0)
                 {
-#pragma unroll
+                    //#pragma unroll
                     for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
                     {
+#if 0
                         threadwise_matrix_copy(
                             a_block_mtx,
                             p_a_block +
@@ -492,14 +538,28 @@
                             a_thread_mtx,
                             p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
                             a_thread_sub_mtx.GetLengths());
+#else
+                        for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
+                        {
+                            for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
+                            {
+                                p_a_thread[a_thread_mtx.Get1dIndex(i,
+                                                                   m_repeat * MPerThreadSubC + j)] =
+                                    p_a_block[a_block_mtx.Get1dIndex(
+                                                  k_begin + i, m_repeat * MPerLevel1Cluster + j) +
+                                              (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA];
+                            }
+                        }
+#endif
                     }
                 }

                 if(BlockMatrixStrideB != 0)
                 {
-#pragma unroll
+                    //#pragma unroll
                     for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
                     {
+#if 0
                         threadwise_matrix_copy(
                             b_block_mtx,
                             p_b_block +
@@ -508,11 +568,25 @@
                             b_thread_mtx,
                             p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
                             b_thread_sub_mtx.GetLengths());
+#else
+                        for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
+                        {
+                            for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
+                            {
+                                p_b_thread[b_thread_mtx.Get1dIndex(i,
+                                                                   n_repeat * NPerThreadSubC + j)] =
+                                    p_b_block[b_block_mtx.Get1dIndex(
+                                                  k_begin + i, n_repeat * NPerLevel1Cluster + j) +
+                                              (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB];
+                            }
+                        }
+#endif
                     }
                 }
             }

-            // do last batch of gemm
+// do last batch of gemm
+#if 0
             threadwise_gemm(a_thread_mtx,
                             True,
                             p_a_thread,
@@ -523,6 +597,23 @@
                             False,
                             p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC,
                             f_accum);
+#else
+            for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
+            {
+                for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
+                {
+                    for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j)
+                    {
+                        const unsigned aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed
+                        const unsigned bindex = b_thread_mtx.Get1dIndex(k, j);
+                        const unsigned cindex = c_thread_mtx.Get1dIndex(i, j) +
+                                                (BatchPerThread - 1) * ThreadMatrixStrideC;
+
+                        f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
+                    }
+                }
+            }
+#endif
         }
     }

@@ -551,9 +642,9 @@
             c_thread_mtx_begin.batch * BlockMatrixStrideC +
             c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);

-        for(unsigned m_repeat = 0; m_repeat, MRepeat; ++m_repeat)
+        for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
         {
-            for(unsigned n_repeat = 0; n_repeat, NRepeat; ++n_repeat)
+            for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
             {
                 threadwise_matrix_copy(
                     c_thread_sub_mtx,
diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp
index 13cab61b46..a6c9d128e8 100644
--- a/src/include/common.hip.hpp
+++ b/src/include/common.hip.hpp
@@ -1,4 +1,9 @@
 #pragma once
+#include "data_type.hip.hpp"
+#include "constant_integral.hip.hpp"
+#include "Sequence.hip.hpp"
+#include "Array.hip.hpp"
+#include "functional.hip.hpp"

 __device__ unsigned get_thread_local_1d_id() { return threadIdx.x; }

@@ -16,129 +21,6 @@ struct is_same
     static const bool value = true;
 };

-template
-struct vector_type
-{
-};
-
-template <>
-struct vector_type
-{
-    using type = float;
-};
-
-template <>
-struct vector_type
-{
-    using type = float2;
-};
-
-template <>
-struct vector_type
-{
- using type = float4; -}; -#endif - -template -struct integral_constant -{ - static const T value = N; - - __host__ __device__ constexpr T Get() const { return value; } -}; - -template -using Number = integral_constant; - -template -struct Sequence -{ - using Type = Sequence; - - static constexpr unsigned nDim = sizeof...(Is); - - const unsigned mData[nDim] = {Is...}; - - template - __host__ __device__ constexpr unsigned Get(Number) const - { - return mData[I]; - } - - template - __host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence) const - { - constexpr auto old_sequence = Type{}; - - constexpr unsigned NR0 = old_sequence.mData[I0]; - constexpr unsigned NR1 = old_sequence.mData[I1]; - constexpr unsigned NR2 = old_sequence.mData[I2]; - constexpr unsigned NR3 = old_sequence.mData[I3]; - - return Sequence{}; - } - - template - __host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence) const - { - // don't know how to implement this - printf("Sequence::ReorderByPutOldToNew not implemented"); - assert(false); - } -}; - #if DEVICE_BACKEND_CUDA template __host__ __device__ constexpr T max(T a, T b) diff --git a/src/include/config.h.in b/src/include/config.h.in index 1c6e325ff9..bb4f6cb51d 100644 --- a/src/include/config.h.in +++ b/src/include/config.h.in @@ -4,9 +4,10 @@ #if DEVICE_BACKEND_HIP #include "hip/hip_runtime.h" +#include "hip/hip_fp16.h" #elif DEVICE_BACKEND_CUDA #include "cuda_runtime.h" +#include "cuda_fp16.h" #include "nvToolsExt.h" #include "helper_cuda.h" -#include "cuda_fp16.h" #endif diff --git a/src/include/constant_integral.hip.hpp b/src/include/constant_integral.hip.hpp new file mode 100644 index 0000000000..70dc69d181 --- /dev/null +++ b/src/include/constant_integral.hip.hpp @@ -0,0 +1,12 @@ +#pragma once + +template +struct integral_constant +{ + static const T value = N; + + __host__ __device__ constexpr T Get() const { return value; } +}; + +template +using Number = integral_constant; diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp new file mode 100644 index 0000000000..95d5b0b33f --- /dev/null +++ b/src/include/data_type.hip.hpp @@ -0,0 +1,214 @@ +#pragma once +#include "config.h" + +template +struct vector_type +{ +}; + +template <> +struct vector_type +{ + using MemoryType = float; +}; + +template <> +struct vector_type +{ + using MemoryType = float2; + + __host__ __device__ static MemoryType Pack(float s0, float s1) + { + union + { + MemoryType vector; + float scalar[2]; + } data; + + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float4; +}; + +template <> +struct vector_type +{ + using MemoryType = float4; +}; + +#if 0 +template <> +struct vector_type +{ + using MemoryType = half; + + __host__ __device__ static MemoryType Pack(half s) { return s; } +}; + +template <> +struct vector_type +{ + using MemoryType = half2; + + __host__ __device__ static MemoryType Pack(half s0, half s1) + { + union + { + MemoryType vector; + half scalar[2]; + } data; + + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float2; +}; + +template <> +struct vector_type +{ + using MemoryType = float4; +}; + +template <> +struct vector_type +{ + using MemoryType = char; + + __host__ __device__ static MemoryType Pack(char s) { return s; } +}; + +template <> +struct vector_type +{ + using MemoryType = int16_t; + + __host__ __device__ static MemoryType Pack(char s0, char s1) + { + 
+        union
+        {
+            MemoryType vector;
+            char scalar[2];
+        } data;
+
+        data.scalar[0] = s0;
+        data.scalar[1] = s1;
+        return data.vector;
+    }
+};
+
+template <>
+struct vector_type<char, 4>
+{
+    using MemoryType = int32_t;
+
+    __host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3)
+    {
+        union
+        {
+            MemoryType vector;
+            char scalar[4];
+        } data;
+
+        data.scalar[0] = s0;
+        data.scalar[1] = s1;
+        data.scalar[2] = s2;
+        data.scalar[3] = s3;
+        return data.vector;
+    }
+};
+
+template <>
+struct vector_type<char, 8>
+{
+    using MemoryType = int64_t;
+};
+
+template <>
+struct vector_type
+{
+    using MemoryType = int64_t;
+};
+
+template <>
+struct vector_type
+{
+    using MemoryType = char4;
+};
+
+template <>
+struct vector_type
+{
+    using MemoryType = int64_t;
+};
+
+template <>
+struct vector_type
+{
+    using MemoryType = int;
+};
+
+template <>
+struct vector_type
+{
+    using MemoryType = int64_t;
+};
+#endif
+
+__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
+{
+    d += s0 * s1;
+}
+
+__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1)
+{
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
+{
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
+    d += s0.z * s1.z;
+    d += s0.w * s1.w;
+}
+
+#if 0
+__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
+
+__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x + s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
+
+// TODO:: this interface is misleading, s0, s1 are actually int8x4
+// need to make a better interface
+__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
+{
+#if DEVICE_BACKEND_CUDA
+    d = __dp4a(s0, s1, d);
+#endif
+}
+#endif
diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp
new file mode 100644
index 0000000000..d3f645eaae
--- /dev/null
+++ b/src/include/functional.hip.hpp
@@ -0,0 +1,57 @@
+#pragma once
+#include "constant_integral.hip.hpp"
+
+template <unsigned NLoop>
+struct static_loop_n
+{
+    template <class F>
+    __host__ __device__ void operator()(F f) const
+    {
+        static_assert(NLoop > 1, "out-of-range");
+
+        f(Number<NLoop - 1>{});
+        static_loop_n<NLoop - 1>{}(f);
+    }
+};
+
+template <>
+struct static_loop_n<1>
+{
+    template <class F>
+    __host__ __device__ void operator()(F f) const
+    {
+        f(Number<0>{});
+    }
+};
+
+template <unsigned NLoop>
+struct static_const_reduce_n
+{
+    template <class F, class Reduce>
+    __host__ __device__ constexpr auto operator()(F f, Reduce r) const
+    {
+        static_assert(NLoop > 1, "out-of-range");
+
+        constexpr auto a = f(Number<NLoop - 1>{});
+        auto b = static_const_reduce_n<NLoop - 1>{}(f, r); // cannot use constexpr here, weird
+        return r(a, b);
+    }
+};
+
+template <>
+struct static_const_reduce_n<1>
+{
+    template <class F, class Reduce>
+    __host__ __device__ constexpr auto operator()(F f, Reduce) const
+    {
+        return f(Number<0>{});
+    }
+};
+
+#if 0
+template <class F>
+__host__ __device__ constexpr auto unpacker(F f)
+{
+    return [=](auto xs_array){ f(xs...); };
+}
+#endif
\ No newline at end of file
diff --git a/src/include/gridwise_direct_convolution_2.hip.hpp b/src/include/gridwise_direct_convolution_2.hip.hpp
deleted file mode 100644
index bb65f1d156..0000000000
--- a/src/include/gridwise_direct_convolution_2.hip.hpp
+++ /dev/null
@@ -1,198 +0,0 @@
-#pragma
once -#include "common.hip.hpp" -#include "ConstantTensorDescriptor.hip.hpp" -#include "blockwise_4d_tensor_op.hip.hpp" -#include "blockwise_direct_convolution.hip.hpp" -#include "threadwise_4d_tensor_op.hip.hpp" -#include "threadwise_direct_convolution.hip.hpp" - -template -__global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto in_global_desc = InGlobalDesc{}; - constexpr auto wei_global_desc = WeiGlobalDesc{}; - constexpr auto out_global_desc = OutGlobalDesc{}; - - constexpr unsigned Y = wei_global_desc.GetLength(I2); - constexpr unsigned X = wei_global_desc.GetLength(I3); - - constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; - constexpr unsigned WiPerBlock = WoPerBlock + X - 1; - - constexpr auto in_block_desc = - make_ConstantTensorDescriptor(Sequence{}); - - constexpr auto wei_block_desc = - make_ConstantTensorDescriptor(Sequence{}); - - // shared mem - constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); - constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); - - __shared__ Float p_in_block[in_block_size]; - __shared__ Float p_wei_block[wei_block_size]; - - // threadwise tensors - constexpr unsigned HiPerThread = HoPerThread + Y - 1; - constexpr unsigned WiPerThread = WoPerThread + X - 1; - - constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, in_block_desc.GetStrides()); - - constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_block_desc.GetStrides()); - - constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor( - in_thread_block_desc, wei_thread_block_desc); - - // register - Float p_out_thread[out_thread_desc.GetElementSpace()]; - - // divide block work - constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; - constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; - constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; - constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; - - const unsigned block_id = blockIdx.x; - - unsigned itmp = block_id; - const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork); - itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork); - const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork); - itmp -= k_block_work_id * (HBlockWork * WBlockWork); - const unsigned h_block_work_id = itmp / WBlockWork; - const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork; - - const unsigned n_block_data_begin = n_block_work_id * NPerBlock; - const unsigned k_block_data_begin = k_block_work_id * KPerBlock; - const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock; - const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock; - - const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding - const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding - - // divide thread work - constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread; - constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread; - constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread; 
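// Each *ThreadWork count is a ceiling division: the number of thread-sized
// tiles needed to cover one block-sized tile along that dimension. Their
// product must equal BlockSize so the flat threadIdx.x decomposition that
// follows assigns exactly one (n, k, ho, wo) tile to every thread. For
// example, with the 2/32/2/32 block tile and 2/4/2/2 thread tile used by
// the driver (illustrative numbers from one of its configurations):
//   NThreadWork = 2/2 = 1, KThreadWork = 32/4 = 8,
//   HThreadWork = 2/2 = 1, WThreadWork = 32/2 = 16,
// giving 1 * 8 * 1 * 16 = 128 = BlockSize. The same split is kept verbatim
// in the nchw_kcyx_nkhw replacement kernel further below.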
- constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread; - - const unsigned thread_id = threadIdx.x; - - itmp = thread_id; - const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork); - itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork); - const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork); - itmp -= k_thread_work_id * (HThreadWork * WThreadWork); - const unsigned h_thread_work_id = itmp / WThreadWork; - const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork; - - const unsigned n_thread_data_begin = n_thread_work_id * NPerThread; - const unsigned k_thread_data_begin = k_thread_work_id * KPerThread; - const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread; - const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread; - - const unsigned hi_thread_data_begin = ho_thread_data_begin; - const unsigned wi_thread_data_begin = wo_thread_data_begin; - - constexpr auto blockwise_in_copy = - Blockwise4dTensorCopy1{}; - - constexpr auto blockwise_wei_copy = - Blockwise4dTensorCopy1{}; - - // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); - - for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1); - c_block_data_begin += CPerBlock, __syncthreads()) - { - // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + - in_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), - p_in_block); - - // copy weight tensor to LDS - blockwise_wei_copy.Run( - p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); - - __syncthreads(); - - for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) - { -// threadwise convolution -#if 1 - threadwise_direct_convolution_2( - in_thread_block_desc, - p_in_block + - in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, - p_out_thread); -#elif 0 - threadwise_direct_convolution_3( - in_thread_block_desc, - p_in_block + - in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, - p_out_thread); -#endif - } - } - - // copy output tensor from register to global mem - threadwise_4d_tensor_copy( - out_thread_desc, - p_out_thread, - out_global_desc, - p_out_global + - out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), - out_thread_desc.GetLengths()); -} diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp new file mode 100644 index 0000000000..1e6d3d24bd --- /dev/null +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -0,0 +1,238 @@ +#pragma once +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_direct_convolution.hip.hpp" +#include "threadwise_4d_tensor_op.hip.hpp" +#include 
"threadwise_direct_convolution.hip.hpp" + +template +__global__ void +gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_nchw_global_desc = InGlobalDesc{}; + constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{}; + constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; + + constexpr unsigned N = in_nchw_global_desc.GetLength(I0); + constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3); + + constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor( + Sequence{}); // 2d view of wei for blockwise copy + + constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; + constexpr unsigned WiPerBlock = WoPerBlock + X - 1; + + constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, + Number{}); // 2d view of wei for blockwise copy + + constexpr auto wei_kcyx_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + Sequence{}); + + // shared mem + constexpr unsigned in_block_size = + in_nchw_block_desc.GetElementSpace(Number{}); + constexpr unsigned wei_block_size = + wei_kcyx_block_desc.GetElementSpace(Number{}); + + constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; + + // threadwise tensors + constexpr unsigned HiPerThread = HoPerThread + Y - 1; + constexpr unsigned WiPerThread = WoPerThread + X - 1; + + constexpr auto in_nchw_thread_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + in_nchw_block_desc.GetStrides()); + + constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, wei_kcyx_block_desc.GetStrides()); + + constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( + in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); + + // register + Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; + + // divide block work + constexpr unsigned NBlockWork = + (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned KBlockWork = + (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned HBlockWork = + (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; + constexpr unsigned WBlockWork = + (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; + + const unsigned block_id = blockIdx.x; + + unsigned itmp = block_id; + const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork); + itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork); + const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork); + itmp -= k_block_work_id * (HBlockWork * WBlockWork); + const unsigned h_block_work_id = itmp / WBlockWork; + const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork; + + const unsigned n_block_data_begin = n_block_work_id * 
NPerBlock; + const unsigned k_block_data_begin = k_block_work_id * KPerBlock; + const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock; + const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock; + + const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding + const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding + + // divide thread work + constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread; + constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread; + constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread; + constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread; + + const unsigned thread_id = threadIdx.x; + + itmp = thread_id; + const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork); + itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork); + const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork); + itmp -= k_thread_work_id * (HThreadWork * WThreadWork); + const unsigned h_thread_work_id = itmp / WThreadWork; + const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork; + + const unsigned n_thread_data_begin = n_thread_work_id * NPerThread; + const unsigned k_thread_data_begin = k_thread_work_id * KPerThread; + const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread; + const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread; + + const unsigned hi_thread_data_begin = ho_thread_data_begin; + const unsigned wi_thread_data_begin = wo_thread_data_begin; + + constexpr auto blockwise_in_copy = + Blockwise4dTensorCopy1{}; + +#if 0 + constexpr auto blockwise_wei_copy = + Blockwise4dTensorCopy1{}; +#elif 1 + const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; +#endif + + // set threadwise output tensor to 0 + threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread); + + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; + c_block_data_begin += CPerBlock, __syncthreads()) + { + // copy input tensor to LDS + blockwise_in_copy.Run(p_in_global + + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), + p_in_block); + + // copy weight tensor to LDS + blockwise_wei_copy.Run( + p_wei_global + + wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); + + __syncthreads(); + + for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) + { +// threadwise convolution +#if 1 + threadwise_direct_convolution_2( + in_nchw_thread_block_desc, + p_in_block + + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, + p_out_thread); +#elif 0 + threadwise_direct_convolution_3( + in_nchw_thread_block_desc, + p_in_block + + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, + p_out_thread); +#endif + } + } + + // copy output tensor from register to global mem + threadwise_4d_tensor_copy( + out_nkhw_thread_desc, + p_out_thread, + out_nkhw_global_desc, + p_out_global + + 
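// The destination offset adds the block-tile origin to the thread-tile
// origin in every dimension before linearizing. For a fully packed NKHW
// descriptor, Get1dIndex(n, k, ho, wo) reduces to
//   ((n * K + k) * Ho + ho) * Wo + wo,
// so each thread writes its register tile to a disjoint region of the
// global output tensor.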
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_nkhw_thread_desc.GetLengths()); +} diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp new file mode 100644 index 0000000000..4d72368b29 --- /dev/null +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -0,0 +1,254 @@ +#pragma once +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_direct_convolution.hip.hpp" +#include "threadwise_4d_tensor_op.hip.hpp" +#include "threadwise_direct_convolution.hip.hpp" + +template +__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( + const typename vector_type::MemoryType* const __restrict__ p_in_vec_global, + const typename vector_type::MemoryType* const __restrict__ p_wei_vec_global, + TOut* const __restrict__ p_out_global) +{ + using in_scalar_t = TInWei; + using in_vector_mem_t = typename vector_type::MemoryType; + using out_scalar_t = TOut; + using accum_t = TAccum; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_nchw_vec_global_desc = InGlobalDesc{}; + constexpr auto wei_kcyx_vec_global_desc = WeiGlobalDesc{}; + constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; + + constexpr unsigned N = in_nchw_vec_global_desc.GetLength(I0); + constexpr unsigned K = wei_kcyx_vec_global_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_vec_global_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_vec_global_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_vec_global_desc.GetLength(I3); + + constexpr auto wei_ke_vec_global_desc = make_ConstantTensorDescriptor( + Sequence{}); // 2d view of wei for blockwise copy + + constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; + constexpr unsigned WiPerBlock = WoPerBlock + X - 1; + + constexpr auto in_nchw_vec_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto wei_ke_vec_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, + Number{}); // 2d view of wei for blockwise copy + + constexpr auto wei_kcyx_vec_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + Sequence{}); + + // shared mem + constexpr unsigned in_block_size = + in_nchw_vec_block_desc.GetElementSpace(Number{}); + + constexpr unsigned wei_block_size = + wei_kcyx_vec_block_desc.GetElementSpace(Number{}); + + constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? 
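// max_align is the wider of the two copy vector widths, and both LDS
// arrays below are rounded up to a multiple of it so that float2/float4
// style accesses never straddle the end of the allocation. For example
// (illustrative numbers), a raw element space of 1150 with max_align = 4
// pads to 4 * ((1150 + 4 - 1) / 4) = 4 * 288 = 1152 elements.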
InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + __shared__ in_vector_mem_t + p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ in_vector_mem_t + p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; + + // threadwise tensors + constexpr unsigned HiPerThread = HoPerThread + Y - 1; + constexpr unsigned WiPerThread = WoPerThread + X - 1; + + constexpr auto in_nchw_vec_thread_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + in_nchw_vec_block_desc.GetStrides()); + + constexpr auto wei_kcyx_vec_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, wei_kcyx_vec_block_desc.GetStrides()); + + constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( + in_nchw_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc); + + // register + out_scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; + + // divide block work + constexpr unsigned NBlockWork = + (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned KBlockWork = + (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned HBlockWork = + (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; + constexpr unsigned WBlockWork = + (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; + + const unsigned block_id = blockIdx.x; + + unsigned itmp = block_id; + const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork); + itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork); + const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork); + itmp -= k_block_work_id * (HBlockWork * WBlockWork); + const unsigned h_block_work_id = itmp / WBlockWork; + const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork; + + const unsigned n_block_data_begin = n_block_work_id * NPerBlock; + const unsigned k_block_data_begin = k_block_work_id * KPerBlock; + const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock; + const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock; + + const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding + const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding + + // divide thread work + constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread; + constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread; + constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread; + constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread; + + const unsigned thread_id = threadIdx.x; + + itmp = thread_id; + const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork); + itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork); + const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork); + itmp -= k_thread_work_id * (HThreadWork * WThreadWork); + const unsigned h_thread_work_id = itmp / WThreadWork; + const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork; + + const unsigned n_thread_data_begin = n_thread_work_id * NPerThread; + const unsigned k_thread_data_begin = k_thread_work_id * KPerThread; + const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread; + const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread; + + const unsigned hi_thread_data_begin = ho_thread_data_begin; + const unsigned wi_thread_data_begin = wo_thread_data_begin; + + constexpr auto blockwise_in_copy = + 
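// Two copy strategies are toggled by the #if/#elif below: the generic
// Blockwise4dTensorCopy1, which walks the tile with its 4-d strides, and
// Blockwise2dTensorCopy3 for the weights, which uses the flattened
// [C * Y * X, K]-style 2-d view declared above so that each thread can
// issue contiguous reads of WeiBlockCopyDataPerRead elements at a time.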
Blockwise4dTensorCopy1{}; + +#if 0 + constexpr auto blockwise_wei_copy = + Blockwise4dTensorCopy1{}; +#elif 1 + const auto blockwise_wei_copy = + Blockwise2dTensorCopy3{}; +#endif + +#if 1 // debug + // set threadwise output tensor to 0 + threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread); +#endif + + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; + c_block_data_begin += CPerBlock, __syncthreads()) + { + // copy input tensor to LDS + blockwise_in_copy.Run(p_in_vec_global + + in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), + p_in_vec_block); + + // copy weight tensor to LDS + blockwise_wei_copy.Run( + p_wei_vec_global + + wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_vec_block); + + __syncthreads(); + + for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) + { +// threadwise convolution +#if 1 + threadwise_direct_convolution_2( + in_nchw_vec_thread_block_desc, + p_in_vec_block + + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_vec_thread_block_desc, + p_wei_vec_block + + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, + p_out_thread); +#elif 0 + threadwise_direct_convolution_3( + in_nchw_vec_thread_block_desc, + p_in_vec_block + + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_vec_thread_block_desc, + p_wei_vec_block + + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, + p_out_thread); +#endif + } + } + + // copy output tensor from register to global mem + threadwise_4d_tensor_copy( + out_nkhw_thread_desc, + p_out_thread, + out_nkhw_global_desc, + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_nkhw_thread_desc.GetLengths()); +} diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp new file mode 100644 index 0000000000..f68b57b6a0 --- /dev/null +++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp @@ -0,0 +1,281 @@ +#pragma once +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "ConstantMatrixDescriptor.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" +#include "threadwise_2d_tensor_op.hip.hpp" +#include "blockwise_gemm.hip.hpp" + +// define B = flatten(N, Hi, Wi) +template +__global__ void +gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_chwn_global_desc = InGlobalDesc{}; + constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{}; + constexpr auto out_khwn_global_desc = OutGlobalDesc{}; + + constexpr unsigned C = in_chwn_global_desc.GetLength(I0); + constexpr unsigned Hi = in_chwn_global_desc.GetLength(I1); + constexpr unsigned Wi = 
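// This kernel tiles its work over a 2-d [K, B] grid, where B = N * Hi * Wi
// is the flattened pixel/batch axis of the CHWN layout (computed just
// below). The 1-d hardware block id is decoded as
//   block_id = k_block_work_id * BBlockWork + b_block_work_id,
// i.e. the B axis varies fastest, and each block owns one
// KPerBlock x BPerBlock tile of the output matrix.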
in_chwn_global_desc.GetLength(I2); + constexpr unsigned N = in_chwn_global_desc.GetLength(I3); + + constexpr unsigned K = out_khwn_global_desc.GetLength(I0); + constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1); + constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2); + + constexpr unsigned Y = wei_cyxk_global_desc.GetLength(I1); + constexpr unsigned X = wei_cyxk_global_desc.GetLength(I2); + + constexpr unsigned B = N * Hi * Wi; + constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1); + + // divide block work by 2d: [K, B] + constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock; + constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock; + + const unsigned k_block_work_id = get_block_1d_id() / BBlockWork; + const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork; + + const unsigned k_block_data_begin = k_block_work_id * KPerBlock; + const unsigned b_block_data_begin = b_block_work_id * BPerBlock; + + // flattend (2d) tensor view of gridwise input + constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); + + // tensor view of blockwise input and weight + // be careful of alignment + constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + // tensor view of threadwise output in register + constexpr auto out_kb_thread_desc = + make_ConstantTensorDescriptor(Sequence{}); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc"); + print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc"); + print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc"); + + print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc"); + print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc"); + + print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); + print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc"); + print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc"); + print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); + + printf("KPerBlock %u\n", KPerBlock); + } +#endif + +// blockwise in copy +// formmat is [CPerBlock,BPerBlock + BGhostRead] +#if 0 + const auto blockwise_in_copy = + Blockwise2dTensorCopy1{}; +#elif 0 + const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; +#elif 1 + const auto blockwise_in_copy = Blockwise2dTensorCopy3{}; +#endif + +// blockwise wei copy +// format is [CPerBlock*Y*X,KPerBlock] +#if 0 + const auto blockwise_wei_copy = + Blockwise2dTensorCopy1{}; +#elif 0 + const auto blockwise_wei_copy = Blockwise2dTensorCopy2{}; +#elif 1 + const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; +#endif + + // a series of blockwise GEMM + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx and b_mtx saved in LDS, c_mtx saved in register + // a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K] + // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] + // c_mtx[K,B] is out_block[K,B] + constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); + + constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, 
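// These matrix descriptors express the convolution as a series of Y * X
// small GEMMs of the form
//   c[K, B] += transpose(a[C, K]) * b[C, B],
// where a is the (y, x) weight slice wei_block[:, y, x, :] and b is the
// input block shifted by y * Wi + x. Accumulating over all filter taps and
// all C chunks yields the full convolution without ever materializing an
// im2col buffer -- the "implicit GEMM" formulation named in the file title.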
Number{}, Number{}); + + constexpr auto c_kxb_thread_mtx_desc = + make_ConstantMatrixDescriptor(Number{}, Number{}); + + const auto blockwise_gemm = + BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2{}; + + // LDS: be careful of alignment + constexpr unsigned in_block_size = + in_cb_block_desc.GetElementSpace(Number{}); + + constexpr unsigned wei_block_size = + wei_cyxk_block_desc.GetElementSpace(Number{}); + + constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + // LDS + __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; + + const Float* p_in_global_block_offset = + p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); + + const Float* p_wei_global_block_offset = + p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + + // register + Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; + + // set threadwise output tensor to 0 + threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); + + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0), + p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0), + __syncthreads()) + { + // load data + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block); + + __syncthreads(); + + // compute on current data + // a series of GEMM + for(unsigned y = 0; y < Y; ++y) + { + for(unsigned x = 0; x < X; ++x) + { + auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; +#if 1 + blockwise_gemm.Run +#else + blockwise_gemm.Run_RegisterDoubleBuffer +#endif + (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block + y * Wi + x, + p_out_thread, + f_accum); + } + } + } + + // output: register to global mem, + const auto c_thread_mtx_begin = + blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const unsigned k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row; + const unsigned b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col; + + for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) + { + for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b) + { + const auto c_thread_mtx_distance = + blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b); + + unsigned k_data = k_thread_data_begin + c_thread_mtx_distance.row; + unsigned b_data = b_thread_data_begin + c_thread_mtx_distance.col; + + unsigned h_data = b_data / (Wi * N); + unsigned itmp = b_data - h_data * (Wi * N); + unsigned w_data = itmp / N; + unsigned n_data = itmp - w_data * N; + + if(n_data < N && h_data < Ho && w_data < Wo) + { + p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] = + p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; + } + } + } +} diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index 37774d360f..728a2d56ba 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -259,7 +259,7 @@ __global__ void 
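// Flipping the two #if blocks in the hunk below switches the
// double-buffered pipeline from plain global->LDS blockwise copies to the
// register-clipboard path: the next C chunk is first loaded from global
// memory into registers while the GEMMs consume the current LDS buffer,
// and only stored to the other LDS buffer (RunStoreRegisterClipboard) once
// the math is done, overlapping global-memory latency with compute. The
// matching register-load step sits outside this hunk and is assumed here
// to follow the same Blockwise2dTensorCopy3 clipboard interface.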
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b __syncthreads(); // load next data -#if 1 +#if 0 blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next); blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next); #elif 1 @@ -292,7 +292,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b } } -#if 0 +#if 1 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next); #endif diff --git a/src/include/threadwise_4d_tensor_op.hip.hpp b/src/include/threadwise_4d_tensor_op.hip.hpp index 3d13ae2aa6..5b908d3ac6 100644 --- a/src/include/threadwise_4d_tensor_op.hip.hpp +++ b/src/include/threadwise_4d_tensor_op.hip.hpp @@ -37,7 +37,8 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re // TODO: in order to optimize mem access for different mem type, // need to write specialized version -template __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src( SrcDesc, - const Float* __restrict__ p_src, + const SrcData* __restrict__ p_src, DstDesc, - Float* __restrict__ p_dst, + DstData* __restrict__ p_dst, SrcOpLengths, DstFromSrcReorder, F f) @@ -88,33 +89,38 @@ __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_d } } -template -__device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p) +template +__device__ void threadwise_4d_tensor_set_zero(Desc, Data* __restrict__ p) { - auto f_set_zero = [](Float& v) { v = Float(0); }; + auto f_set_zero = [](Data& v) { v = Data(0); }; - threadwise_4d_tensor_pointwise_operation_unary( + threadwise_4d_tensor_pointwise_operation_unary( Desc{}, p, f_set_zero); } -template +template __device__ void threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, - const Float* __restrict__ p_src, + const SrcData* __restrict__ p_src, DstDesc, - Float* __restrict__ p_dst, + DstData* __restrict__ p_dst, SrcOpLengths, DstFromSrcReorder) { - auto f_copy = [](const Float& src, Float& dst) { dst = src; }; + auto f_copy = [](const SrcData& src, DstData& dst) { dst = static_cast(src); }; threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src( SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy); } -template +template __device__ void threadwise_4d_tensor_copy( - SrcDesc, const Float* __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths) + SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths) { auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{}; diff --git a/src/include/threadwise_direct_convolution.hip.hpp b/src/include/threadwise_direct_convolution.hip.hpp index 32d446491b..b9a509d6a0 100644 --- a/src/include/threadwise_direct_convolution.hip.hpp +++ b/src/include/threadwise_direct_convolution.hip.hpp @@ -2,13 +2,13 @@ #include "ConstantTensorDescriptor.hip.hpp" // optimized for scenario if p_in, p_wei, p_out are in register -template +template __device__ void threadwise_direct_convolution_1(InDesc, - Float* const __restrict__ p_in, + TInWei* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + TInWei* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + TOut* __restrict__ p_out) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -51,25 +51,8 @@ __device__ void threadwise_direct_convolution_1(InDesc, const unsigned out_index = 
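// The change just below replaces the raw p_out[...] += p_wei[...] * p_in[...]
// (and its printf debugging) with fused_multiply_accumulate from
// data_type.hip.hpp, so the same threadwise loop nest serves every element
// type: the float/float2/float4 overloads unroll into scalar FMAs, and the
// currently #if 0'd int32_t overload maps an int8x4 dot product onto
// __dp4a on CUDA backends. E.g. with TInWei = float4, one call accumulates
// four multiply-adds:
//   fused_multiply_accumulate(acc, w4, x4); // acc += dot(w4, x4)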
out_desc.Get1dIndex(n, k, ho, wo); - p_out[out_index] += p_wei[wei_index] * p_in[in_index]; - -#if 0 - // if(threadIdx.x == 0) - { - printf("threadwise_direct_convolution: \t" - "threadIdx.x %u\t" - "out_index %u, p_out[out_index] %f, \t" - "wei_index %u, p_wei[wei_index] %f, \t" - "in_index %u, p_in[in_index] %f\n", - threadIdx.x, - out_index, - p_out[out_index], - wei_index, - p_wei[wei_index], - in_index, - p_in[in_index]); - } -#endif + fused_multiply_accumulate( + p_out[out_index], p_wei[wei_index], p_in[in_index]); } } } @@ -81,13 +64,13 @@ __device__ void threadwise_direct_convolution_1(InDesc, // Optimized for scenario if p_in and p_wei are in LDS, p_out are in register // Copy in and wei into register before doing convolution -template +template __device__ void threadwise_direct_convolution_2(InDesc, - Float* const __restrict__ p_in, + TInWei* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + TInWei* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + TOut* __restrict__ p_out) { constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; @@ -97,8 +80,8 @@ __device__ void threadwise_direct_convolution_2(InDesc, constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths()); // register - Float p_in_reg[in_reg_desc.GetElementSpace()]; - Float p_wei_reg[wei_reg_desc.GetElementSpace()]; + TInWei p_in_reg[in_reg_desc.GetElementSpace()]; + TInWei p_wei_reg[wei_reg_desc.GetElementSpace()]; // copy input tensor into register threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc.GetLengths()); @@ -114,13 +97,13 @@ __device__ void threadwise_direct_convolution_2(InDesc, // optimized for scenario where p_in and p_wei are in LDS, p_out is in register // break down a non-1x1 convolution into a sequence of 1x1 convolutions, // load 1x1 weight into register, and do 1x1 convolution in register. -template +template __device__ void threadwise_direct_convolution_3(InDesc, - Float* const __restrict__ p_in, + Data* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + Data* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + Data* __restrict__ p_out) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -139,8 +122,8 @@ __device__ void threadwise_direct_convolution_3(InDesc, constexpr auto wei_reg_desc = make_ConstantTensorDescriptor( Sequence{}); - Float p_in_reg[in_reg_desc.GetElementSpace()]; - Float p_wei_reg[wei_reg_desc.GetElementSpace()]; + Data p_in_reg[in_reg_desc.GetElementSpace()]; + Data p_wei_reg[wei_reg_desc.GetElementSpace()]; constexpr unsigned in_w_new_read = 1; diff --git a/src/include/threadwise_nd_tensor_op.hip.hpp b/src/include/threadwise_nd_tensor_op.hip.hpp index c787afae77..510525db5d 100644 --- a/src/include/threadwise_nd_tensor_op.hip.hpp +++ b/src/include/threadwise_nd_tensor_op.hip.hpp @@ -10,7 +10,7 @@ __device__ void threadwise_6d_tensor_copy(SrcDesc, SrcOpLengths, Number) { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::MemoryType; static_assert(SrcDesc{}.GetDimension() == 6 && DstDesc{}.GetDimension() == 6 && SrcOpLengths::nDim == 6, @@ -80,7 +80,7 @@ __device__ void threadwise_8d_tensor_copy(SrcDesc, SrcOpLengths, Number) { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::MemoryType; static_assert(SrcDesc{}.GetDimension() == 8 && DstDesc{}.GetDimension() == 8 && SrcOpLengths::nDim == 8,