From 4b616aad52807740908071e90e06e184d3177357 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 5 Feb 2019 00:51:37 -0600 Subject: [PATCH] refactor --- driver/conv.cu | 39 +- ...icit_gemm_convolution_1_chwn_csrk_khwn.cuh | 2 +- ...m_convolution_1_chwn_csrk_khwn_padded.cuh} | 76 ++-- ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh | 168 +++++++++ ...icit_gemm_convolution_2_cnhw_srck_knhw.cuh | 45 ++- src/include/blockwise_2d_tensor_op.cuh | 37 ++ ...m_convolution_1_chwn_csrk_khwn_padded.cuh} | 54 ++- ...n_1_chwn_csrk_khwn_padded_lds_pipeline.cuh | 353 ++++++++++++++++++ ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh | 293 +++++++++++++++ ...volution_2_cnhw_csrk_knhw_lds_pipeline.cuh | 339 +++++++++++++++++ ...icit_gemm_convolution_2_cnhw_srck_knhw.cuh | 2 +- ...olution_2_cnhw_srck_knhw_lds_pipeline.cuh} | 14 +- 12 files changed, 1338 insertions(+), 84 deletions(-) rename driver/{device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh => device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh} (78%) create mode 100644 driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh rename src/include/{gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh => gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh} (84%) create mode 100644 src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh create mode 100644 src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh create mode 100644 src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh rename src/include/{gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh => gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh} (96%) diff --git a/driver/conv.cu b/driver/conv.cu index 04ab123266..b43779d974 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -11,8 +11,9 @@ #include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh" #include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh" #include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh" -#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh" +#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh" #include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh" +#include "device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh" //#include "device_winograd_convolution.cuh" struct GeneratorTensor_1 @@ -382,11 +383,14 @@ int main() #if 0 constexpr unsigned N = 1; constexpr unsigned C = 1; - constexpr unsigned HI = 10; - constexpr unsigned WI = 10; + constexpr unsigned HI = 28; + constexpr unsigned WI = 28; constexpr unsigned K = 1; constexpr unsigned S = 3; constexpr unsigned R = 3; + + constexpr unsigned HPad = 1; + constexpr unsigned WPad = 1; #elif 0 // 3x3, 34x34 constexpr unsigned N = 64; @@ -567,6 +571,9 @@ int main() #elif 1 in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); +#elif 1 + in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread); + wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread); #endif unsigned nrepeat = 100; @@ -582,26 +589,28 @@ int main() device_implicit_gemm_convolution_1_nchw_srck_nkhw #elif 0 device_implicit_gemm_convolution_1_chwn_csrk_khwn -#elif 1 +#elif 0 device_implicit_gemm_convolution_2_cnhw_srck_knhw +#elif 1 + device_implicit_gemm_convolution_2_cnhw_csrk_knhw #elif 0 device_winograd_convolution #endif (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 - device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(in_nchw_desc, - in_nchw, - wei_kcsr_desc, - wei_kcsr, - out_nkhw_desc, - out_nkhw_device, - lower_pads, - upper_pads, - nrepeat); +#elif 1 + device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(in_nchw_desc, + in_nchw, + wei_kcsr_desc, + wei_kcsr, + out_nkhw_desc, + out_nkhw_device, + lower_pads, + upper_pads, + nrepeat); #endif -#if 1 +#if 0 if(S == 3 && R == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads); diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh index 157219601e..7bf43cf2a7 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh @@ -226,7 +226,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, cudaEventElapsedTime(&elapsedTime, start, stop); printf("Elapsed time : %f ms\n", elapsedTime); - usleep(10000); + usleep(std::min(elapsedTime * 1000, float(10000))); } checkCudaErrors(cudaGetLastError()); diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh similarity index 78% rename from driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh rename to driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh index 46aa80fbe2..d1699d1fbb 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh @@ -1,17 +1,19 @@ #pragma once -#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh" +#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh" +#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh" #include +#include template -void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc, - const Tensor& in_nchw, - WeiDesc, - const Tensor& wei_kcsr, - OutDesc, - Tensor& out_nkhw, - LowerPads, - UpperPads, - unsigned nrepeat) +void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcsr, + OutDesc, + Tensor& out_nkhw, + LowerPads, + UpperPads, + unsigned nrepeat) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -88,6 +90,9 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc, constexpr unsigned HoPerThread = 1; constexpr unsigned WoPerThread = 1; + constexpr unsigned WeiBlockCopyThreadPerDim0 = 1; + constexpr unsigned WeiBlockCopyThreadPerDim1 = 1; + constexpr unsigned BlockSize = 8; #elif 0 // for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256 @@ -180,6 +185,9 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc, constexpr unsigned HoPerThread = 1; constexpr unsigned WoPerThread = 1; + constexpr unsigned WeiBlockCopyThreadPerDim0 = 2; + constexpr unsigned WeiBlockCopyThreadPerDim1 = 64; + constexpr unsigned BlockSize = 128; #elif 0 // for 5x5 filter, 20x84 image, 1x1 padding @@ -225,6 +233,9 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc, constexpr unsigned HoPerThread = 1; constexpr unsigned WoPerThread = 1; + constexpr unsigned WeiBlockCopyThreadPerDim0 = 4; + constexpr unsigned WeiBlockCopyThreadPerDim1 = 32; + constexpr unsigned BlockSize = 128; #endif @@ -245,24 +256,31 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc, cudaEventCreate(&start); cudaEventRecord(start, 0); - gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding +#if 1 + gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded +#elif 1 + gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline +#endif + <<>>(static_cast(in_chwn_device_buf.GetDeviceBuffer()), static_cast(wei_csrk_device_buf.GetDeviceBuffer()), static_cast(out_khwn_device_buf.GetDeviceBuffer())); @@ -274,7 +292,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc, cudaEventElapsedTime(&elapsedTime, start, stop); printf("Elapsed time : %f ms\n", elapsedTime); - usleep(elapsedTime * 1000); + usleep(std::min(elapsedTime * 1000, float(10000))); } checkCudaErrors(cudaGetLastError()); diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh new file mode 100644 index 0000000000..f17411e68b --- /dev/null +++ b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh @@ -0,0 +1,168 @@ +#pragma once +#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh" +#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh" +#include + +template +void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcsr, + OutDesc, + Tensor& out_nkhw, + unsigned nrepeat) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_nchw_desc = InDesc{}; + constexpr auto wei_kcsr_desc = WeiDesc{}; + constexpr auto out_nkhw_desc = OutDesc{}; + + constexpr unsigned N = in_nchw_desc.GetLength(I0); + constexpr unsigned Hi = in_nchw_desc.GetLength(I2); + constexpr unsigned Wi = in_nchw_desc.GetLength(I3); + + constexpr unsigned Ho = out_nkhw_desc.GetLength(I2); + constexpr unsigned Wo = out_nkhw_desc.GetLength(I3); + + constexpr unsigned K = wei_kcsr_desc.GetLength(I0); + constexpr unsigned C = wei_kcsr_desc.GetLength(I1); + constexpr unsigned S = wei_kcsr_desc.GetLength(I2); + constexpr unsigned R = wei_kcsr_desc.GetLength(I3); + + constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1); + + // convert in_nchw to in_cnhw + auto in_cnhw_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(in_cnhw_desc, std::cout << "in_cnhw_desc: "); + + Tensor in_cnhw(make_TensorDescriptor(in_cnhw_desc)); + + auto f_reorder_nchw2cnhw = [&](auto n, auto c, auto hi, auto wi) { + in_cnhw(c, n, hi, wi) = in_nchw(n, c, hi, wi); + }; + + make_ParallelTensorFunctor(f_reorder_nchw2cnhw, N, C, Hi, Wi)( + std::thread::hardware_concurrency()); + + // convert wei_kcsr to wei_csrk + auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(wei_csrk_desc, std::cout << "wei_csrk_desc: "); + + Tensor wei_csrk(make_TensorDescriptor(wei_csrk_desc)); + + auto f_reorder_kcsr2csrk = [&](auto k, auto c, auto s, auto r) { + wei_csrk(c, s, r, k) = wei_kcsr(k, c, s, r); + }; + + make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, S, R)( + std::thread::hardware_concurrency()); + + // conver out_nkhw to out_knhw + auto out_knhw_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(out_knhw_desc, std::cout << "out_knhw_desc: "); + + Tensor out_knhw(make_TensorDescriptor(out_knhw_desc)); + +#if 1 + // 1x1, 28x28 + constexpr unsigned BPerBlock = 64; + constexpr unsigned KPerBlock = 64; + constexpr unsigned CPerBlock = 8; + + constexpr unsigned BPerThread = 4; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 1; + + constexpr unsigned GemmRowThreadPerCluster = 4; + constexpr unsigned GemmColumnThreadPerCluster = 8; + + constexpr unsigned InBlockCopyThreadPerDim0 = 4; + constexpr unsigned InBlockCopyThreadPerDim1 = 16; + + constexpr unsigned WeiBlockCopyThreadPerDim0 = 4; + constexpr unsigned WeiBlockCopyThreadPerDim1 = 16; + + constexpr unsigned BlockSize = 64; +#endif + + constexpr unsigned GridSize = + ((N * Hi * Wi + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock); + + dim3 block_dim(BlockSize); + dim3 grid_dim(GridSize); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + // mem + std::size_t data_sz = sizeof(T); + DeviceMem in_cnhw_device_buf(data_sz * (in_cnhw.mDesc.GetElementSpace() + BGhostRead + + BPerBlock)); // reserve extra space for BGhostRead + DeviceMem wei_csrk_device_buf(data_sz * wei_csrk.mDesc.GetElementSpace()); + DeviceMem out_knhw_device_buf(data_sz * out_knhw.mDesc.GetElementSpace()); + + in_cnhw_device_buf.ToDevice(in_cnhw.mData.data()); + wei_csrk_device_buf.ToDevice(wei_csrk.mData.data()); + out_knhw_device_buf.ToDevice(out_knhw.mData.data()); + + for(unsigned i = 0; i < nrepeat; ++i) + { + cudaEvent_t start, stop; + float elapsedTime; + cudaEventCreate(&start); + cudaEventRecord(start, 0); + +#if 1 + gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw +#else + gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline +#endif + + <<>>(in_cnhw_desc, + static_cast(in_cnhw_device_buf.GetDeviceBuffer()), + wei_csrk_desc, + static_cast(wei_csrk_device_buf.GetDeviceBuffer()), + out_knhw_desc, + static_cast(out_knhw_device_buf.GetDeviceBuffer())); + + cudaEventCreate(&stop); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Elapsed time : %f ms\n", elapsedTime); + + usleep(std::min(elapsedTime * 1000, float(10000))); + } + + checkCudaErrors(cudaGetLastError()); + out_knhw_device_buf.FromDevice(out_knhw.mData.data()); + + // convert out_knhw to out_nkhw + auto f_reorder_knhw2nkhw = [&](auto n, auto k, auto ho, auto wo) { + out_nkhw(n, k, ho, wo) = out_knhw(k, n, ho, wo); + }; + + make_ParallelTensorFunctor(f_reorder_knhw2nkhw, N, K, Ho, Wo)( + std::thread::hardware_concurrency()); +} diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh index f9b0394e03..e995baa615 100644 --- a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh +++ b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh @@ -1,6 +1,6 @@ #pragma once #include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh" -#include "gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh" +#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh" #include template @@ -122,8 +122,8 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, constexpr unsigned GemmRowThreadPerCluster = 8; constexpr unsigned GemmColumnThreadPerCluster = 8; - constexpr unsigned InBlockCopyThreadPerDim0 = 2; - constexpr unsigned InBlockCopyThreadPerDim1 = 64; + constexpr unsigned InBlockCopyThreadPerDim0 = 8; + constexpr unsigned InBlockCopyThreadPerDim1 = 16; constexpr unsigned BlockSize = 128; #endif @@ -154,22 +154,27 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, cudaEventCreate(&start); cudaEventRecord(start, 0); - gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw +#if 0 + gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw +#else + gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline +#endif + <<>>(in_cnhw_desc, static_cast(in_cnhw_device_buf.GetDeviceBuffer()), wei_srck_desc, @@ -184,7 +189,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, cudaEventElapsedTime(&elapsedTime, start, stop); printf("Elapsed time : %f ms\n", elapsedTime); - usleep(10000); + usleep(std::min(elapsedTime * 1000, float(10000))); } checkCudaErrors(cudaGetLastError()); diff --git a/src/include/blockwise_2d_tensor_op.cuh b/src/include/blockwise_2d_tensor_op.cuh index 5d347de8ef..cbc42e01d0 100644 --- a/src/include/blockwise_2d_tensor_op.cuh +++ b/src/include/blockwise_2d_tensor_op.cuh @@ -187,6 +187,8 @@ struct blockwise_2d_tensor_copy_2 __device__ blockwise_2d_tensor_copy_2() { + static_assert(is_same::value, "wrong! type is not float!\n"); + mThreadId0 = get_thread_local_1d_id() / ThreadPerDim1; mThreadId1 = get_thread_local_1d_id() - mThreadId0 * ThreadPerDim1; } @@ -225,7 +227,14 @@ struct blockwise_2d_tensor_copy_2 for(unsigned d1v4loop = 0; d1v4loop < Dim1V4Loop; ++d1v4loop) { unsigned did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1; +#if 1 + const unsigned sindex = src_desc.Get1dIndex(did0, did1); + const unsigned dindex = dst_desc.Get1dIndex(did0, did1); + *(reinterpret_cast(p_dst + dindex)) = + *(reinterpret_cast(p_src + sindex)); + +#else for(unsigned i = 0; i < 4; ++i) { const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i); @@ -233,6 +242,7 @@ struct blockwise_2d_tensor_copy_2 p_dst[dindex] = p_src[sindex]; } +#endif } // v2 @@ -241,6 +251,14 @@ struct blockwise_2d_tensor_copy_2 unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1; +#if 1 + const unsigned sindex = src_desc.Get1dIndex(did0, did1); + const unsigned dindex = dst_desc.Get1dIndex(did0, did1); + + *(reinterpret_cast(p_dst + dindex)) = + *(reinterpret_cast(p_src + sindex)); + +#else for(unsigned i = 0; i < 2; ++i) { const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i); @@ -248,6 +266,7 @@ struct blockwise_2d_tensor_copy_2 p_dst[dindex] = p_src[sindex]; } +#endif } // v1 @@ -291,6 +310,14 @@ struct blockwise_2d_tensor_copy_2 { unsigned did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1; +#if 1 + const unsigned sindex = src_desc.Get1dIndex(did0, did1); + const unsigned dindex = dst_desc.Get1dIndex(did0, did1); + + *(reinterpret_cast(p_dst + dindex)) = + *(reinterpret_cast(p_src + sindex)); + +#else for(unsigned i = 0; i < 4; ++i) { const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i); @@ -298,6 +325,7 @@ struct blockwise_2d_tensor_copy_2 p_dst[dindex] = p_src[sindex]; } +#endif } // v2 @@ -306,6 +334,14 @@ struct blockwise_2d_tensor_copy_2 unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1; +#if 1 + const unsigned sindex = src_desc.Get1dIndex(did0, did1); + const unsigned dindex = dst_desc.Get1dIndex(did0, did1); + + *(reinterpret_cast(p_dst + dindex)) = + *(reinterpret_cast(p_src + sindex)); + +#else for(unsigned i = 0; i < 2; ++i) { const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i); @@ -313,6 +349,7 @@ struct blockwise_2d_tensor_copy_2 p_dst[dindex] = p_src[sindex]; } +#endif } // v1 diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh similarity index 84% rename from src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh rename to src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh index b2f428e4e5..5bb1f67579 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh @@ -3,6 +3,7 @@ #include "ConstantTensorDescriptor.cuh" #include "ConstantMatrixDescriptor.cuh" #include "blockwise_4d_tensor_op.cuh" +#include "blockwise_2d_tensor_op.cuh" #include "threadwise_4d_tensor_op.cuh" #include "blockwise_gemm.cuh" @@ -23,11 +24,13 @@ template -__global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding( - Float* const __restrict__ p_in_global, - Float* const __restrict__ p_wei_global, - Float* __restrict__ p_out_global) + unsigned WoPerThread, + unsigned WeiBlockCopyThreadPerDim0, + unsigned WeiBlockCopyThreadPerDim1> +__global__ void +gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restrict__ p_in_global, + Float* const __restrict__ p_wei_global, + Float* __restrict__ p_out_global) { // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N] // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N" @@ -82,6 +85,9 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock; const unsigned n_block_data_begin = n_block_work_id * NPerBlock; + // flattened (2d) tensor view of wei in global mem + constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); + // tensor view of blockwise input and weight in LDS constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor(Sequence{}); @@ -89,6 +95,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor(Sequence{}); + // flattened (2d) tensor view of wei in LDS + constexpr auto wei_ek_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + // tensor view of threadwise output in register constexpr auto out_hkwn_thread_desc = make_ConstantTensorDescriptor(Sequence{}); @@ -133,13 +143,33 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding decltype(in_chwn_block_desc.GetLengths()), LowerPads>{}; - // weight: format is [S,R,C,K] +#if 1 + // weight: format is [C,S,R,K] constexpr auto blockwise_wei_copy = blockwise_4d_tensor_copy_1{}; +#elif 1 + // weight: format is [C*S*R,K] + constexpr auto blockwise_wei_copy = + blockwise_2d_tensor_copy_1{}; +#elif 1 + // weight: format is [C*S*R,K] + const auto blockwise_wei_copy = + blockwise_2d_tensor_copy_2{}; +#endif // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -190,8 +220,12 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding // set threadwise output tensor to 0 threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread); - for(unsigned c_block_data_begin = 0; c_block_data_begin < C; - c_block_data_begin += CPerBlock, __syncthreads()) + Float* p_wei_global_block_begin = + p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin); + + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, + p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0), + __syncthreads()) { #if 1 // input: global mem to LDS, @@ -209,9 +243,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding #if 1 // weight: global mem to LDS, - blockwise_wei_copy.run(p_wei_global + wei_csrk_global_desc.Get1dIndex( - c_block_data_begin, 0, 0, k_block_data_begin), - p_wei_block); + blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block); #endif __syncthreads(); diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh new file mode 100644 index 0000000000..c92b500f72 --- /dev/null +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh @@ -0,0 +1,353 @@ +#pragma once +#include "common.cuh" +#include "ConstantTensorDescriptor.cuh" +#include "ConstantMatrixDescriptor.cuh" +#include "blockwise_4d_tensor_op.cuh" +#include "blockwise_2d_tensor_op.cuh" +#include "threadwise_4d_tensor_op.cuh" +#include "blockwise_gemm.cuh" + +template +__global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline( + Float* const __restrict__ p_in_global, + Float* const __restrict__ p_wei_global, + Float* __restrict__ p_out_global) +{ + // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N] + // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N" + // if we use [C,Hi,N,Wi,N] in LDS, then NPerThread can be different from NPerBlock + static_assert(NPerBlock % NPerThread == 0, "wrong! NPerBlock % NPerThread !=0"); + static_assert((NPerThread < NPerBlock && WoPerThread == 1) || NPerThread == NPerBlock, + "wrong!"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_chwn_global_desc = InGlobalDesc{}; + constexpr auto wei_csrk_global_desc = WeiGlobalDesc{}; + constexpr auto out_khwn_global_desc = OutGlobalDesc{}; + + constexpr unsigned C = in_chwn_global_desc.GetLength(I0); + + constexpr unsigned K = out_khwn_global_desc.GetLength(I0); + constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1); + constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2); + constexpr unsigned N = out_khwn_global_desc.GetLength(I3); + + constexpr unsigned S = wei_csrk_global_desc.GetLength(I1); + constexpr unsigned R = wei_csrk_global_desc.GetLength(I2); + + constexpr unsigned HPadLow = LowerPads{}.Get(I0); + constexpr unsigned WPadLow = LowerPads{}.Get(I1); + + constexpr unsigned HPadUp = UpperPads{}.Get(I0); + constexpr unsigned WPadUp = UpperPads{}.Get(I1); + + constexpr unsigned HiPerBlock = HoPerBlock + S - 1; + constexpr unsigned WiPerBlock = WoPerBlock + R - 1; + + // divide block work: [K, Ho, Wo, N] + constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock; + constexpr unsigned HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock; + constexpr unsigned WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock; + constexpr unsigned NBlockWork = (N + NPerBlock - 1) / NPerBlock; + + const unsigned k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork); + unsigned itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork); + const unsigned h_block_work_id = itmp / (WBlockWork * NBlockWork); + itmp -= h_block_work_id * (WBlockWork * NBlockWork); + const unsigned w_block_work_id = itmp / NBlockWork; + const unsigned n_block_work_id = itmp - w_block_work_id * NBlockWork; + + const unsigned k_block_data_begin = k_block_work_id * KPerBlock; + const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock; + const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock; + const unsigned n_block_data_begin = n_block_work_id * NPerBlock; + + // flattened (2d) tensor view of wei in global mem + constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); + + // tensor view of blockwise input and weight in LDS + constexpr auto in_chwn_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto wei_csrk_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + // flattened (2d) tensor view of wei in LDS + constexpr auto wei_ek_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + // tensor view of threadwise output in register + constexpr auto out_hkwn_thread_desc = + make_ConstantTensorDescriptor(Sequence{}); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc"); + print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc"); + print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc"); + } +#endif + + // blockwise copy + // input: format is [C, Hi, Wi, N] + const unsigned h_block_pad_low = h_block_work_id == 0 ? HPadLow : 0; + const unsigned w_block_pad_low = w_block_work_id == 0 ? WPadLow : 0; + + const unsigned h_block_pad_up = h_block_work_id == HBlockWork - 1 ? HPadUp : 0; + const unsigned w_block_pad_up = w_block_work_id == WBlockWork - 1 ? WPadUp : 0; + +#if 0 + if(get_thread_local_1d_id() == 0) + ; + { + printf( + "%u %u, h_block_pad_low %u w_block_pad_low %u h_block_pad_up %u w_block_pad_up %u\n", + get_block_1d_id(), + get_thread_local_1d_id(), + h_block_pad_low, + w_block_pad_low, + h_block_pad_up, + w_block_pad_up); + } +#endif + + constexpr auto blockwise_in_copy = + blockwise_chwn_tensor_copy_with_padding{}; + +#if 0 + // weight: format is [C,S,R,K] + constexpr auto blockwise_wei_copy = + blockwise_4d_tensor_copy_1{}; +#elif 0 + // weight: format is [C*S*R,K] + constexpr auto blockwise_wei_copy = + blockwise_2d_tensor_copy_1{}; +#elif 1 + // weight: format is [C*S*R,K] + const auto blockwise_wei_copy = + blockwise_2d_tensor_copy_2{}; +#endif + + // a series of blockwise batched GEMM + // C_matrix += transpose(A_matrix) * B_matrix + // A_matrix and B_matrix saved in LDS, C_matrix saved in register + // A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K] + // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] + // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N] + const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, + Number{}, + Number{}); // constexpr doesn't compile + + const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, + Number{}, + Number{}); // constexpr doesn't compile + + const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}); // constexpr doesn't compile + + const auto blockwise_batch_gemm = + blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c{}; + + // LDS + constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace(); + constexpr unsigned wei_block_size = wei_csrk_block_desc.GetElementSpace(); + + // LDS double buffer + __shared__ Float p_in_block_0[in_block_size]; + __shared__ Float p_wei_block_0[wei_block_size]; + + __shared__ Float p_in_block_1[in_block_size]; + __shared__ Float p_wei_block_1[wei_block_size]; + + // register + Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()]; + + // set threadwise output tensor to 0 + threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread); + + Float* p_wei_global_block_begin = + p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin); + + // prelog: load data + // input: global mem to LDS, + blockwise_in_copy.run(p_in_global, + 0, + ho_block_data_begin, + wo_block_data_begin, + n_block_data_begin, + p_in_block_0, + h_block_pad_low, + w_block_pad_low, + h_block_pad_up, + w_block_pad_up); + + // weight: global mem to LDS, + blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block_0); + + p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0); + + bool even_loop = true; + + for(unsigned c_block_data_begin = CPerBlock; c_block_data_begin < C; + c_block_data_begin += CPerBlock, + p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0), + even_loop = !even_loop) + { + __syncthreads(); + + Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; + Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1; + + Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0; + Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0; + + // preload next data +#if 1 + // input: global mem to LDS, + blockwise_in_copy.run(p_in_global, + c_block_data_begin, + ho_block_data_begin, + wo_block_data_begin, + n_block_data_begin, + p_in_block_next, + h_block_pad_low, + w_block_pad_low, + h_block_pad_up, + w_block_pad_up); +#endif + +#if 1 + // weight: global mem to LDS, + blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block_next); +#endif + + // a series of batched GEMM + for(unsigned s = 0; s < S; ++s) + { + for(unsigned r = 0; r < R; ++r) + { + auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; + + blockwise_batch_gemm.run(p_wei_block_now + + wei_csrk_block_desc.Get1dIndex(0, s, r, 0), + p_in_block_now + in_chwn_block_desc.Get1dIndex(0, s, r, 0), + p_out_thread, + f_accum); + } + } + } + + // last computation + { + __syncthreads(); + + Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; + Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1; + + // a series of batched GEMM + for(unsigned s = 0; s < S; ++s) + { + for(unsigned r = 0; r < R; ++r) + { + auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; + + blockwise_batch_gemm.run(p_wei_block_now + + wei_csrk_block_desc.Get1dIndex(0, s, r, 0), + p_in_block_now + in_chwn_block_desc.Get1dIndex(0, s, r, 0), + p_out_thread, + f_accum); + } + } + } + + const auto matrix_c_index = + blockwise_batch_gemm.CalculateThreadMatrixCIndex(get_thread_local_1d_id()); + + const unsigned ho_thread_data_begin = matrix_c_index.batch_begin; + const unsigned k_thread_data_begin = matrix_c_index.row_begin; + const unsigned wo_thread_data_begin = matrix_c_index.col_begin / NPerBlock; + const unsigned n_thread_data_begin = + matrix_c_index.col_begin - wo_thread_data_begin * NPerBlock; + +#if 0 + printf("block %u %u, %u %u %u %u, %u %u %u %u, %f \n", + get_block_1d_id(), get_thread_local_1d_id(), + ho_block_data_begin, k_block_data_begin, wo_block_data_begin, n_block_data_begin, + ho_thread_data_begin, k_thread_data_begin, wo_thread_data_begin, n_thread_data_begin, + p_out_thread[0]); +#endif + + // output: register to global mem, + // convert out_thread[Ho,K,Wo,N] to out_global[K,Ho,Wo,N] + constexpr auto reorder_khwn_from_hkwn = Sequence<1, 0, 2, 3>{}; + + threadwise_4d_tensor_copy_reorder_by_get_dst_from_src( + out_hkwn_thread_desc, + p_out_thread, + out_khwn_global_desc, + p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_hkwn_thread_desc.GetLengths(), + reorder_khwn_from_hkwn); +} diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh new file mode 100644 index 0000000000..49e4643794 --- /dev/null +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh @@ -0,0 +1,293 @@ +#pragma once +#include "common.cuh" +#include "ConstantTensorDescriptor.cuh" +#include "ConstantMatrixDescriptor.cuh" +#include "blockwise_4d_tensor_op.cuh" +#include "blockwise_2d_tensor_op.cuh" +#include "threadwise_2d_tensor_op.cuh" +#include "blockwise_gemm.cuh" + +// define B = flatten(N, Hi, Wi) +template +__global__ void +gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, + Float* const __restrict__ p_in_global, + WeiGlobalDesc, + Float* const __restrict__ p_wei_global, + OutGlobalDesc, + Float* __restrict__ p_out_global) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_cnhw_global_desc = InGlobalDesc{}; + constexpr auto wei_csrk_global_desc = WeiGlobalDesc{}; + constexpr auto out_knhw_global_desc = OutGlobalDesc{}; + + constexpr unsigned C = in_cnhw_global_desc.GetLength(I0); + constexpr unsigned N = in_cnhw_global_desc.GetLength(I1); + constexpr unsigned Hi = in_cnhw_global_desc.GetLength(I2); + constexpr unsigned Wi = in_cnhw_global_desc.GetLength(I3); + + constexpr unsigned K = out_knhw_global_desc.GetLength(I0); + constexpr unsigned Ho = out_knhw_global_desc.GetLength(I2); + constexpr unsigned Wo = out_knhw_global_desc.GetLength(I3); + + constexpr unsigned S = wei_csrk_global_desc.GetLength(I1); + constexpr unsigned R = wei_csrk_global_desc.GetLength(I2); + + constexpr unsigned B = N * Hi * Wi; + constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1); + + // divide block work by 2d: [K, B] + constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock; + constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock; + + const unsigned k_block_work_id = get_block_1d_id() / BBlockWork; + const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork; + + const unsigned k_block_data_begin = k_block_work_id * KPerBlock; + const unsigned b_block_data_begin = b_block_work_id * BPerBlock; + +#if 0 + if(get_thread_local_1d_id() == 0) + { + printf("K %u B %u, BGhostRead %u\n", K, B, BGhostRead); + + printf("%u %u, KBlockWork %u BBlockWork %u, k_block_data_begin %u b_block_data_begin %u\n", + get_block_1d_id(), + get_thread_local_1d_id(), + KBlockWork, + BBlockWork, + k_block_data_begin, + b_block_data_begin); + } +#endif + + // flattend (2d) tensor view of gridwise input + constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); + + // tensor view of blockwise input and weight + constexpr auto in_cb_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto wei_ek_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto wei_csrk_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + // tensor view of threadwise output in register + constexpr auto out_kb_thread_desc = + make_ConstantTensorDescriptor(Sequence{}); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); + print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc"); + print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); + + printf("KPerBlock %u\n", KPerBlock); + } +#endif + + // blockwise in copy + // formmat is [CPerBlock,BPerBlock + BGhostRead] +#if 0 + const auto blockwise_in_copy = + blockwise_2d_tensor_copy_1{}; +#elif 1 + const auto blockwise_in_copy = + blockwise_2d_tensor_copy_2{}; +#endif + + // blockwise wei copy + // format is [CPerBlock*S*R,KPerBlock] +#if 0 + const auto blockwise_wei_copy = + blockwise_2d_tensor_copy_1{}; +#elif 1 + const auto blockwise_wei_copy = + blockwise_2d_tensor_copy_2{}; +#endif + + // a series of blockwise GEMM + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx and b_mtx saved in LDS, c_mtx saved in register + // a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K] + // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] + // c_mtx[K,B] is out_block[K,B] + const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}); // constexpr doesn't compile + + const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, + Number{}, + Number{}); // constexpr doesn't compile + + const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}); // constexpr doesn't compile + + const auto blockwise_gemm = + blockwise_gemm_block_a_block_b_thread_c{}; + + // LDS + constexpr unsigned in_block_size = in_cb_block_desc.GetElementSpace(); + constexpr unsigned wei_block_size = wei_csrk_block_desc.GetElementSpace(); + + __shared__ Float p_in_block[in_block_size]; + __shared__ Float p_wei_block[wei_block_size]; + + // register + Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; + + // set threadwise output tensor to 0 + threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); + + Float* p_in_global_block_offset = + p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); + + Float* p_wei_global_block_offset = + p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0), + p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I2), + __syncthreads()) + { + // input: global mem to LDS, + blockwise_in_copy.run(p_in_global_block_offset, p_in_block); + + // weight: global mem to LDS, + blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block); + + __syncthreads(); + + // a series of GEMM + for(unsigned s = 0; s < S; ++s) + { + for(unsigned r = 0; r < R; ++r) + { + auto f_accum = [](auto& c, const auto&& ab) { c += ab; }; + + blockwise_gemm.run(p_wei_block + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0), + p_in_block + s * Wi + r, + p_out_thread, + f_accum); + } + } + } + + // output: register to global mem, + const auto matrix_c_index = + blockwise_gemm.CalculateThreadMatrixCIndex(get_thread_local_1d_id()); + + const unsigned k_thread_data_begin = matrix_c_index.row_begin; + const unsigned b_thread_data_begin = matrix_c_index.col_begin; + + const unsigned k_data_begin = k_block_data_begin + k_thread_data_begin; + const unsigned b_data_begin = b_block_data_begin + b_thread_data_begin; + +#if 0 + if(get_block_1d_id() == 0) + { + printf("%u %u, row_begin %u col_begin %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n", + get_block_1d_id(), + get_thread_local_1d_id(), + matrix_c_index.row_begin, + matrix_c_index.col_begin, + k_data_begin, + b_data_begin, + p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]); + } +#endif + + for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) + { + for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b) + { + unsigned k_data = k_data_begin + k; + unsigned b_data = b_data_begin + b; + + unsigned n_data = b_data / (Hi * Wi); + unsigned itmp = b_data - n_data * (Hi * Wi); + unsigned h_data = itmp / Wi; + unsigned w_data = itmp - h_data * Wi; + +#if 0 + if(get_block_1d_id() == 0) + { + printf("%u %u, k %u b %u, k_data %u n_data %u h_data %u w_data %u %f\n", + get_block_1d_id(), + get_thread_local_1d_id(), + k, + b, + k_data, + n_data, + h_data, + w_data, + p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]); + } +#endif + if(n_data < N && h_data < Ho && w_data < Wo) + { +#if 1 + p_out_global[out_knhw_global_desc.Get1dIndex(k_data, n_data, h_data, w_data)] = + p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; +#endif + } + } + } +} diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh new file mode 100644 index 0000000000..78c0d5a419 --- /dev/null +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh @@ -0,0 +1,339 @@ +#pragma once +#include "common.cuh" +#include "ConstantTensorDescriptor.cuh" +#include "ConstantMatrixDescriptor.cuh" +#include "blockwise_4d_tensor_op.cuh" +#include "blockwise_2d_tensor_op.cuh" +#include "threadwise_2d_tensor_op.cuh" +#include "blockwise_gemm.cuh" + +// define B = flatten(N, Hi, Wi) +template +__global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline( + InGlobalDesc, + Float* const __restrict__ p_in_global, + WeiGlobalDesc, + Float* const __restrict__ p_wei_global, + OutGlobalDesc, + Float* __restrict__ p_out_global) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_cnhw_global_desc = InGlobalDesc{}; + constexpr auto wei_csrk_global_desc = WeiGlobalDesc{}; + constexpr auto out_knhw_global_desc = OutGlobalDesc{}; + + constexpr unsigned C = in_cnhw_global_desc.GetLength(I0); + constexpr unsigned N = in_cnhw_global_desc.GetLength(I1); + constexpr unsigned Hi = in_cnhw_global_desc.GetLength(I2); + constexpr unsigned Wi = in_cnhw_global_desc.GetLength(I3); + + constexpr unsigned K = out_knhw_global_desc.GetLength(I0); + constexpr unsigned Ho = out_knhw_global_desc.GetLength(I2); + constexpr unsigned Wo = out_knhw_global_desc.GetLength(I3); + + constexpr unsigned S = wei_csrk_global_desc.GetLength(I1); + constexpr unsigned R = wei_csrk_global_desc.GetLength(I2); + + constexpr unsigned B = N * Hi * Wi; + constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1); + + // divide block work by 2d: [K, B] + constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock; + constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock; + + const unsigned k_block_work_id = get_block_1d_id() / BBlockWork; + const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork; + + const unsigned k_block_data_begin = k_block_work_id * KPerBlock; + const unsigned b_block_data_begin = b_block_work_id * BPerBlock; + +#if 0 + if(get_thread_local_1d_id() == 0) + { + printf("K %u B %u, BGhostRead %u\n", K, B, BGhostRead); + + printf("%u %u, KBlockWork %u BBlockWork %u, k_block_data_begin %u b_block_data_begin %u\n", + get_block_1d_id(), + get_thread_local_1d_id(), + KBlockWork, + BBlockWork, + k_block_data_begin, + b_block_data_begin); + } +#endif + + // flattend (2d) tensor view of gridwise input + constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); + + // tensor view of blockwise input and weight + constexpr auto in_cb_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto wei_ek_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto wei_csrk_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + // tensor view of threadwise output in register + constexpr auto out_kb_thread_desc = + make_ConstantTensorDescriptor(Sequence{}); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); + print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc"); + print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); + + printf("KPerBlock %u\n", KPerBlock); + } +#endif + + // blockwise in copy + // formmat is [CPerBlock,BPerBlock + BGhostRead] +#if 0 + const auto blockwise_in_copy = + blockwise_2d_tensor_copy_1{}; +#elif 1 + const auto blockwise_in_copy = + blockwise_2d_tensor_copy_2{}; +#endif + + // blockwise wei copy + // format is [CPerBlock*S*R,KPerBlock] +#if 0 + const auto blockwise_wei_copy = + blockwise_2d_tensor_copy_1{}; +#elif 1 + const auto blockwise_wei_copy = + blockwise_2d_tensor_copy_2{}; +#endif + + // a series of blockwise GEMM + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx and b_mtx saved in LDS, c_mtx saved in register + // a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K] + // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] + // c_mtx[K,B] is out_block[K,B] + const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}); // constexpr doesn't compile + + const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, + Number{}, + Number{}); // constexpr doesn't compile + + const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}); // constexpr doesn't compile + + const auto blockwise_gemm = + blockwise_gemm_block_a_block_b_thread_c{}; + + // LDS + constexpr unsigned in_block_size = in_cb_block_desc.GetElementSpace(); + constexpr unsigned wei_block_size = wei_csrk_block_desc.GetElementSpace(); + + // LDS double buffer + __shared__ Float p_in_block_0[in_block_size]; + __shared__ Float p_wei_block_0[wei_block_size]; + + __shared__ Float p_in_block_1[in_block_size]; + __shared__ Float p_wei_block_1[wei_block_size]; + + // register + Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; + + Float* p_in_global_block_offset = + p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); + + Float* p_wei_global_block_offset = + p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + + // prelog : preload data + // input: global mem to LDS, + blockwise_in_copy.run(p_in_global_block_offset, p_in_block_0); + + // weight: global mem to LDS, + blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block_0); + + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); + + p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I2); + + // set threadwise output tensor to 0 + threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); + + bool even_loop = true; + + for(unsigned c_block_data_begin = CPerBlock; c_block_data_begin < C; + c_block_data_begin += CPerBlock, + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0), + p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I2), + even_loop = !even_loop) + { + __syncthreads(); + + Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; + Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1; + + Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0; + Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0; + + // input: global mem to LDS, + blockwise_in_copy.run(p_in_global_block_offset, p_in_block_next); + + // weight: global mem to LDS, + blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block_next); + + // a series of GEMM + for(unsigned s = 0; s < S; ++s) + { + for(unsigned r = 0; r < R; ++r) + { + auto f_accum = [](auto& c, const auto&& ab) { c += ab; }; + + blockwise_gemm.run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0), + p_in_block_now + s * Wi + r, + p_out_thread, + f_accum); + } + } + } + + // last computation + { + __syncthreads(); + + Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; + Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1; + + // a series of GEMM + for(unsigned s = 0; s < S; ++s) + { + for(unsigned r = 0; r < R; ++r) + { + auto f_accum = [](auto& c, const auto&& ab) { c += ab; }; + + blockwise_gemm.run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0), + p_in_block_now + s * Wi + r, + p_out_thread, + f_accum); + } + } + } + + // output: register to global mem, + const auto matrix_c_index = + blockwise_gemm.CalculateThreadMatrixCIndex(get_thread_local_1d_id()); + + const unsigned k_thread_data_begin = matrix_c_index.row_begin; + const unsigned b_thread_data_begin = matrix_c_index.col_begin; + + const unsigned k_data_begin = k_block_data_begin + k_thread_data_begin; + const unsigned b_data_begin = b_block_data_begin + b_thread_data_begin; + +#if 0 + if(get_block_1d_id() == 0) + { + printf("%u %u, row_begin %u col_begin %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n", + get_block_1d_id(), + get_thread_local_1d_id(), + matrix_c_index.row_begin, + matrix_c_index.col_begin, + k_data_begin, + b_data_begin, + p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]); + } +#endif + + for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) + { + for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b) + { + unsigned k_data = k_data_begin + k; + unsigned b_data = b_data_begin + b; + + unsigned n_data = b_data / (Hi * Wi); + unsigned itmp = b_data - n_data * (Hi * Wi); + unsigned h_data = itmp / Wi; + unsigned w_data = itmp - h_data * Wi; + +#if 0 + if(get_block_1d_id() == 0) + { + printf("%u %u, k %u b %u, k_data %u n_data %u h_data %u w_data %u %f\n", + get_block_1d_id(), + get_thread_local_1d_id(), + k, + b, + k_data, + n_data, + h_data, + w_data, + p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]); + } +#endif + if(n_data < N && h_data < Ho && w_data < Wo) + { +#if 1 + p_out_global[out_knhw_global_desc.Get1dIndex(k_data, n_data, h_data, w_data)] = + p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; +#endif + } + } + } +} diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh index fb413cff15..883f00d120 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh @@ -115,7 +115,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc, decltype(in_cb_global_desc), decltype(in_cb_block_desc), decltype(in_cb_block_desc.GetLengths())>{}; -#elif 0 +#elif 1 const auto blockwise_in_copy = blockwise_2d_tensor_copy_2 -__global__ void -gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw(InGlobalDesc, - Float* const __restrict__ p_in_global, - WeiGlobalDesc, - Float* const __restrict__ p_wei_global, - OutGlobalDesc, - Float* __restrict__ p_out_global) +__global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline( + InGlobalDesc, + Float* const __restrict__ p_in_global, + WeiGlobalDesc, + Float* const __restrict__ p_wei_global, + OutGlobalDesc, + Float* __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{};