From 88b77181aab1198b41b612f6d03b6dfb2d32bd40 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 11 Jun 2019 14:35:59 -0500 Subject: [PATCH] rename files, added header guard, added namespace --- ...e_convolution_direct_v2_nchw_kcyx_nkhw.hpp | 6 +- ...lution_implicit_gemm_v1_chwn_cyxk_khwn.hpp | 10 +- ...lution_implicit_gemm_v1_nchw_cyxk_khwn.hpp | 282 ---------- ...lution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp | 12 +- ...lution_implicit_gemm_v2_chwn_cyxk_khwn.hpp | 6 +- ...lution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp | 10 +- ...lution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp | 10 +- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 2 + ...mm_convolution_1_chwn_cyxk_khwn_padded.hpp | 2 + driver/driver.cpp | 14 +- src/CMakeLists.txt | 2 +- src/device.cpp | 2 +- src/include/Array.hpp | 23 +- src/include/ConstantMatrixDescriptor.hpp | 10 +- .../ConstantMergedTensorDescriptor.hpp | 9 +- src/include/ConstantTensorDescriptor.hpp | 26 +- src/include/Sequence.hpp | 9 +- src/include/amd_inline_asm.hpp | 9 +- src/include/blockwise_2d_tensor_op.hpp | 16 +- src/include/blockwise_3d_tensor_op.hpp | 26 +- src/include/blockwise_4d_tensor_op.hpp | 28 +- src/include/blockwise_batched_gemm.hpp | 11 +- src/include/blockwise_gemm.hpp | 11 +- ...> blockwise_generic_tensor_slice_copy.hpp} | 16 +- ...op.hpp => blockwise_tensor_slice_copy.hpp} | 31 +- src/include/common.hpp | 10 +- src/include/{config.h.in => config.hpp.in} | 25 +- src/include/conv_common.hpp | 8 +- src/include/device.hpp | 10 +- src/include/functional.hpp | 9 +- src/include/functional2.hpp | 9 +- src/include/functional3.hpp | 9 +- ...e_convolution_direct_v2_nchw_kcyx_nkhw.hpp | 11 +- ...tion_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp | 23 +- ...tion_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp | 21 +- ...tion_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp | 352 ------------ ...tion_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp | 29 +- ...v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp} | 31 +- ..._v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp | 501 ------------------ 
...tion_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp | 462 ---------------- ...tion_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp | 38 +- ...v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp} | 40 +- ...lution_implicit_gemm_v2_chwn_cyxk_khwn.hpp | 11 +- ...mm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp | 13 +- ...lution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp | 21 +- ...m_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp} | 23 +- ...lution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp | 23 +- ...m_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp} | 25 +- .../gridwise_convolution_kernel_wrapper.hpp | 16 + src/include/gridwise_convolution_wrapper.hpp | 9 - ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 4 + ...mm_convolution_1_chwn_cyxk_khwn_padded.hpp | 4 + src/include/integral_constant.hpp | 8 +- src/include/tensor.hpp | 6 +- src/include/threadwise_4d_tensor_op.hpp | 9 +- src/include/threadwise_direct_convolution.hpp | 11 +- src/include/threadwise_gemm.hpp | 9 +- src/include/threadwise_generic_tensor_op.hpp | 19 + ... threadwise_generic_tensor_slice_copy.hpp} | 9 +- ...p.hpp => threadwise_tensor_slice_copy.hpp} | 9 +- src/include/{base.hpp => utility.hpp} | 12 +- src/include/vector_type.hpp | 12 +- 62 files changed, 580 insertions(+), 1844 deletions(-) delete mode 100644 driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp rename src/include/{blockwise_generic_tensor_slice_op.hpp => blockwise_generic_tensor_slice_copy.hpp} (98%) rename src/include/{blockwise_tensor_slice_op.hpp => blockwise_tensor_slice_copy.hpp} (94%) rename src/include/{config.h.in => config.hpp.in} (89%) delete mode 100644 src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp rename src/include/{gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp => gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp} (95%) delete mode 100644 src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp delete mode 100644 
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp rename src/include/{gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp => gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp} (94%) rename src/include/{gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp => gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp} (96%) rename src/include/{gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp => gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp} (96%) create mode 100644 src/include/gridwise_convolution_kernel_wrapper.hpp delete mode 100644 src/include/gridwise_convolution_wrapper.hpp create mode 100644 src/include/threadwise_generic_tensor_op.hpp rename src/include/{threadwise_generic_tensor_slice_op.hpp => threadwise_generic_tensor_slice_copy.hpp} (96%) rename src/include/{threadwise_tensor_slice_op.hpp => threadwise_tensor_slice_copy.hpp} (98%) rename src/include/{base.hpp => utility.hpp} (94%) diff --git a/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp b/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp index 790bcfbb9a..8e1126bd77 100644 --- a/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp +++ b/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp @@ -1,9 +1,11 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp" +using namespace ck; + template void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc, const Tensor& in, @@ -79,7 +81,7 @@ void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc, WoPerThread, InBlockCopyDataPerRead, WeiBlockCopyDataPerRead>; - float time = launch_kernel(run_gridwise_convolution, + float time = launch_kernel(run_gridwise_convolution_kernel, dim3(GridSize), dim3(BlockSize), 0, diff --git 
a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 05e85f5bfb..4ffa1de4d9 100644 --- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -1,11 +1,13 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp" #include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp" #include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp" -#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp" +#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp" + +using namespace ck; template void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, @@ -478,7 +480,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, #elif 0 GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn #elif 1 - GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn + GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer #endif {}; - float time = launch_kernel(run_gridwise_convolution, + float time = launch_kernel(run_gridwise_convolution_kernel, dim3(GridSize), dim3(BlockSize), 0, diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp deleted file mode 100644 index cebc92f907..0000000000 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp +++ /dev/null @@ -1,282 +0,0 @@ -#pragma once -#include -#include "device.hpp" -#include "gridwise_convolution_wrapper.hpp" -#include "gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp" -#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp" -#include 
"gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp" - -template -void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, - const Tensor& in_nchw, - WeiDesc, - const Tensor& wei_kcyx, - OutDesc, - Tensor& out_nkhw, - index_t nrepeat) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto in_nchw_desc = InDesc{}; - constexpr auto wei_kcyx_desc = WeiDesc{}; - constexpr auto out_nkhw_desc = OutDesc{}; - - constexpr index_t Hi = in_nchw_desc.GetLength(I2); - constexpr index_t Wi = in_nchw_desc.GetLength(I3); - - constexpr index_t N = out_nkhw_desc.GetLength(I0); - constexpr index_t Ho = out_nkhw_desc.GetLength(I2); - constexpr index_t Wo = out_nkhw_desc.GetLength(I3); - - constexpr index_t K = wei_kcyx_desc.GetLength(I0); - constexpr index_t C = wei_kcyx_desc.GetLength(I1); - constexpr index_t Y = wei_kcyx_desc.GetLength(I2); - constexpr index_t X = wei_kcyx_desc.GetLength(I3); - - // reorder weight - auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence{}); - ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); - - Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); - - auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) { - wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x); - }; - - make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)( - std::thread::hardware_concurrency()); - - // output - auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence{}); - ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: "); - - Tensor out_khwn(make_TensorDescriptor(out_khwn_desc)); - - std::size_t data_sz = sizeof(T); - DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace()); - DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace()); - DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace()); - - 
in_nchw_device_buf.ToDevice(in_nchw.mData.data()); - wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); - out_khwn_device_buf.ToDevice(out_khwn.mData.data()); - -#if 1 - // for 3x3, 34x34, v1r3, Pascal - constexpr index_t BlockSize = 128; - - constexpr index_t NPerBlock = 2; - constexpr index_t KPerBlock = 128; - constexpr index_t CPerBlock = 8; - constexpr index_t HoPerBlock = 2; - constexpr index_t WoPerBlock = 16; - - constexpr index_t NPerThread = 2; - constexpr index_t KPerThread = 8; - constexpr index_t HoPerThread = 1; - constexpr index_t WoPerThread = 4; - - constexpr index_t GemmMPerThreadSubC = 4; - constexpr index_t GemmNPerThreadSubC = 4; - constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 2; - constexpr index_t GemmMLevel1Cluster = 4; - constexpr index_t GemmNLevel1Cluster = 2; - constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmDataPerReadA = 4; - constexpr index_t GemmDataPerReadB = 4; - - using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>; - using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>; - using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; - constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW - constexpr index_t InBlockReorderDataPerWrite_N = 2; - - using WeiBlockCopyClusterLengths = void; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; - - constexpr index_t OutThreadCopyDataPerWrite_N = 2; -#elif 1 - // for 3x3, 34x34, v1r3, Vega 20 - constexpr index_t BlockSize = 256; - - constexpr index_t NPerBlock = 2; - constexpr index_t KPerBlock = 128; - constexpr index_t CPerBlock = 8; - constexpr index_t HoPerBlock = 4; - constexpr index_t WoPerBlock = 16; - - constexpr index_t NPerThread = 2; - constexpr index_t KPerThread = 8; - constexpr index_t HoPerThread = 1; - constexpr index_t WoPerThread = 4; - - constexpr index_t GemmMPerThreadSubC = 4; - constexpr index_t GemmNPerThreadSubC = 4; - 
constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 2; - constexpr index_t GemmMLevel1Cluster = 4; - constexpr index_t GemmNLevel1Cluster = 2; - constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmDataPerReadA = 4; - constexpr index_t GemmDataPerReadB = 4; - - using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>; - using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 2, 16>; - using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; - constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW - constexpr index_t InBlockReorderDataPerWrite_N = 2; - - using WeiBlockCopyClusterLengths = void; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; - - constexpr index_t OutThreadCopyDataPerWrite_N = 2; -#elif 0 - // for 3x3, 28x28, v1r2, Pascal - constexpr index_t BlockSize = 128; - - constexpr index_t NPerBlock = 16; - constexpr index_t KPerBlock = 128; - constexpr index_t CPerBlock = 8; - constexpr index_t HoPerBlock = 2; - constexpr index_t WoPerBlock = 2; - - constexpr index_t NPerThread = 4; - constexpr index_t KPerThread = 8; - constexpr index_t HoPerThread = 1; - constexpr index_t WoPerThread = 2; - - constexpr index_t GemmMPerThreadSubC = 4; - constexpr index_t GemmNPerThreadSubC = 4; - constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 2; - constexpr index_t GemmMLevel1Cluster = 4; - constexpr index_t GemmNLevel1Cluster = 2; - constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmDataPerReadA = 4; - constexpr index_t GemmDataPerReadB = 4; - - using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 2>; - using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>; - using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; - constexpr index_t InBlockReorderDataPerRead_W = 2; - constexpr index_t InBlockReorderDataPerWrite_N = 4; - - using WeiBlockCopyClusterLengths = 
Sequence<4, 1, 32>; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; - - constexpr index_t OutThreadCopyDataPerWrite_N = 2; -#elif 0 - // for 3x3, 28x28, v1r3, Pascal, bad - constexpr index_t BlockSize = 128; - - constexpr index_t NPerBlock = 16; - constexpr index_t KPerBlock = 128; - constexpr index_t CPerBlock = 8; - constexpr index_t HoPerBlock = 2; - constexpr index_t WoPerBlock = 2; - - constexpr index_t NPerThread = 4; - constexpr index_t KPerThread = 8; - constexpr index_t HoPerThread = 1; - constexpr index_t WoPerThread = 2; - - constexpr index_t GemmMPerThreadSubC = 4; - constexpr index_t GemmNPerThreadSubC = 4; - constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 2; - constexpr index_t GemmMLevel1Cluster = 4; - constexpr index_t GemmNLevel1Cluster = 2; - constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmDataPerReadA = 4; - constexpr index_t GemmDataPerReadB = 4; - - using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>; - using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>; - using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; - constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW - constexpr index_t InBlockReorderDataPerWrite_N = 1; - - using WeiBlockCopyClusterLengths = void; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; - - constexpr index_t OutThreadCopyDataPerWrite_N = 2; -#endif - - constexpr index_t GridSize = - ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) * - ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock); - - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); - - for(index_t i = 0; i < nrepeat; ++i) - { - constexpr auto gridwise_conv = -#if 0 - GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn -#elif 0 - GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn -#elif 1 - 
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn -#endif - {}; - - float time = launch_kernel(run_gridwise_convolution, - dim3(GridSize), - dim3(BlockSize), - 0, - static_cast(in_nchw_device_buf.GetDeviceBuffer()), - static_cast(wei_cyxk_device_buf.GetDeviceBuffer()), - static_cast(out_khwn_device_buf.GetDeviceBuffer())); - - printf("Elapsed time : %f ms, %f TFlop/s\n", - time, - (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) / - (std::size_t(1000) * 1000 * 1000) / time); - usleep(std::min(time * 1000, float(10000))); - } - - out_khwn_device_buf.FromDevice(out_khwn.mData.data()); - - // reorder output - auto f_reorder_khwn2nkhw = [&](auto k, auto ho, auto wo, auto n) { - out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n); - }; - - make_ParallelTensorFunctor(f_reorder_khwn2nkhw, K, Ho, Wo, N)( - std::thread::hardware_concurrency()); -} diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp index 43c8512b87..b973d19237 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp @@ -1,9 +1,11 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp" -#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp" +#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp" + +using namespace ck; template void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, @@ -313,10 +315,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, for(index_t i = 0; i < nrepeat; ++i) { constexpr auto gridwise_conv = -#if 1 +#if 0 GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw #else - GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw + 
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer #endif {}; - float time = launch_kernel(run_gridwise_convolution, + float time = launch_kernel(run_gridwise_convolution_kernel, dim3(GridSize), dim3(BlockSize), 0, diff --git a/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp index 8033d32b53..09c29fce7b 100644 --- a/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp @@ -1,10 +1,12 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" #include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp" +using namespace ck; + template void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc, const Tensor& in_nchw, @@ -303,7 +305,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc, WeiBlockCopyDataPerRead, OutThreadCopyDataPerWrite>{}; - float time = launch_kernel(run_gridwise_convolution, + float time = launch_kernel(run_gridwise_convolution_kernel, dim3(GridSize), dim3(BlockSize), 0, diff --git a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp index 17feafef95..3a7ea0ab7b 100644 --- a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp @@ -1,9 +1,11 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" -#include "gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp" +#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp" + +using namespace 
ck; template void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, @@ -102,7 +104,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, #if 0 GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw #else - GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw + GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer #endif {}; #if 1 - float time = launch_kernel(run_gridwise_convolution, + float time = launch_kernel(run_gridwise_convolution_kernel, dim3(GridSize), dim3(BlockSize), 0, diff --git a/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp b/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp index 0704ddc867..44434d46e8 100644 --- a/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp @@ -1,9 +1,11 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp" -#include "gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp" +#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp" + +using namespace ck; template void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc, @@ -96,7 +98,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc, #if 0 GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw #else - GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw + GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer #endif {}; - float time = launch_kernel(run_gridwise_convolution, + float time = launch_kernel(run_gridwise_convolution_kernel, dim3(GridSize), dim3(BlockSize), 0, diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index cddbb4df3c..386abe5ddd 100644 --- 
a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -3,6 +3,8 @@ #include "device.hpp" #include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" +using namespace ck; + template void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, const Tensor& in_nchw, diff --git a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp index f8ed51516d..68011d8d9a 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp +++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp @@ -3,6 +3,8 @@ #include "device.hpp" #include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" +using namespace ck; + template void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc, const Tensor& in_nchw, diff --git a/driver/driver.cpp b/driver/driver.cpp index ededb0d87f..636a1b1109 100644 --- a/driver/driver.cpp +++ b/driver/driver.cpp @@ -3,19 +3,19 @@ #include #include #include -#include "config.h" +#include "config.hpp" #include "tensor.hpp" #include "ConstantTensorDescriptor.hpp" #include "conv_common.hpp" #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp" -//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" -#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp" #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" #include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" #include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp" +using namespace ck; + struct GeneratorTensor_1 { template @@ -419,7 +419,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 0 +#elif 1 // 3x3, 34x34 constexpr index_t 
N = 64; constexpr index_t C = 256; @@ -633,15 +633,9 @@ int main(int argc, char* argv[]) #if 1 #if 0 - device_direct_convolution_1 -#elif 0 device_convolution_direct_v2_nchw_kcyx_nkhw -#elif 0 - device_direct_convolution_2_vectorized_nchw_kcyx_nkhw #elif 0 device_convolution_implicit_gemm_v1_chwn_cyxk_khwn -#elif 0 - device_convolution_implicit_gemm_v1_nchw_cyxk_khwn #elif 0 device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw #elif 0 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 31300c8600..a34b7b3c89 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -configure_file("${PROJECT_SOURCE_DIR}/src/include/config.h.in" "${PROJECT_BINARY_DIR}/src/include/config.h") +configure_file("${PROJECT_SOURCE_DIR}/src/include/config.hpp.in" "${PROJECT_BINARY_DIR}/src/include/config.hpp") set(TENSOR_SOURCE tensor.cpp; diff --git a/src/device.cpp b/src/device.cpp index 03bd208d4d..f65597ca76 100644 --- a/src/device.cpp +++ b/src/device.cpp @@ -1,4 +1,4 @@ -#include "config.h" +#include "config.hpp" #include "device.hpp" DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) diff --git a/src/include/Array.hpp b/src/include/Array.hpp index 9f9192bad9..fcf87c5843 100644 --- a/src/include/Array.hpp +++ b/src/include/Array.hpp @@ -1,7 +1,11 @@ -#pragma once +#ifndef CK_ARRAY_HPP +#define CK_ARRAY_HPP + #include "Sequence.hpp" #include "functional2.hpp" +namespace ck { + template struct Array { @@ -96,7 +100,7 @@ __host__ __device__ constexpr auto reorder_array_given_new2old(const Array>::value, "wrong! 
invalid reorder map"); - return Array{old_array.mSize[IRs]...}; + return Array{old_array[IRs]...}; } template @@ -180,7 +184,7 @@ __host__ __device__ constexpr auto operator+(Array a, Array result; - auto f = mod_conv::plus{}; + auto f = math::plus{}; static_for<0, NSize, 1>{}( lambda_array_math( @@ -195,7 +199,7 @@ __host__ __device__ constexpr auto operator-(Array a, Array result; - auto f = mod_conv::minus{}; + auto f = math::minus{}; static_for<0, NSize, 1>{}( lambda_array_math( @@ -212,7 +216,7 @@ __host__ __device__ constexpr auto operator+(Array a, Sequence result; - auto f = mod_conv::plus{}; + auto f = math::plus{}; static_for<0, NSize, 1>{}( lambda_array_math( @@ -229,7 +233,7 @@ __host__ __device__ constexpr auto operator-(Array a, Sequence result; - auto f = mod_conv::minus{}; + auto f = math::minus{}; static_for<0, NSize, 1>{}( lambda_array_math( @@ -246,7 +250,7 @@ __host__ __device__ constexpr auto operator*(Array a, Sequence result; - auto f = mod_conv::multiplies{}; + auto f = math::multiplies{}; static_for<0, NSize, 1>{}( lambda_array_math( @@ -263,7 +267,7 @@ __host__ __device__ constexpr auto operator-(Sequence a, Array result; - auto f = mod_conv::minus{}; + auto f = math::minus{}; static_for<0, NSize, 1>{}( lambda_array_math( @@ -368,3 +372,6 @@ __host__ __device__ void print_Array(const char* s, Array a) a[9]); }); } + +} // namespace ck +#endif diff --git a/src/include/ConstantMatrixDescriptor.hpp b/src/include/ConstantMatrixDescriptor.hpp index 8339580d01..a2b88001ac 100644 --- a/src/include/ConstantMatrixDescriptor.hpp +++ b/src/include/ConstantMatrixDescriptor.hpp @@ -1,6 +1,10 @@ -#pragma once +#ifndef CK_CONSTANT_MATRIX_DESCRIPTOR_HPP +#define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP + #include "common.hpp" +namespace ck { + template struct ConstantMatrixDescriptor { @@ -57,3 +61,7 @@ __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s) printf("%s NRow %u NCol %u RowStride %u\n", s, desc.NRow(), desc.NCol(), 
desc.RowStride()); } + +} // namespace ck + +#endif diff --git a/src/include/ConstantMergedTensorDescriptor.hpp b/src/include/ConstantMergedTensorDescriptor.hpp index 21a08a3b67..244bd0eec7 100644 --- a/src/include/ConstantMergedTensorDescriptor.hpp +++ b/src/include/ConstantMergedTensorDescriptor.hpp @@ -1,7 +1,11 @@ -#pragma once +#ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP +#define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" +namespace ck { + // OriginalTensorDesc : ConstantTensorDescriptor<...> // it's the tensor whose dimensions are to be merged // OriginalDimMergeSeqs : Sequence<...>... @@ -184,3 +188,6 @@ __host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDe { print_ConstantTensorDescriptor(s, TDesc::GetOriginalTensorDescriptor()); } + +} // namespace ck +#endif diff --git a/src/include/ConstantTensorDescriptor.hpp b/src/include/ConstantTensorDescriptor.hpp index f2decc3f54..5325259858 100644 --- a/src/include/ConstantTensorDescriptor.hpp +++ b/src/include/ConstantTensorDescriptor.hpp @@ -1,11 +1,15 @@ -#pragma once +#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP +#define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP + #include "common.hpp" +namespace ck { + template __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths) { return reverse_inclusive_scan_sequence( - Lengths{}.PopFront(), mod_conv::multiplies{}, Number<1>{}) + Lengths{}.PopFront(), math::multiplies{}, Number<1>{}) .PushBack(Number<1>{}); } @@ -13,7 +17,7 @@ template __host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number) { constexpr index_t L_back_align = - Align * mod_conv::integer_divide_ceiler{}(Lengths{}.Back(), Align); + Align * math::integer_divide_ceiler{}(Lengths{}.Back(), Align); return calculate_tensor_strides_packed( Lengths{}.Modify(Number{}, Number{})); @@ -100,7 +104,7 @@ struct ConstantTensorDescriptor __host__ __device__ static constexpr index_t 
GetElementSize() { - return accumulate_on_sequence(Lengths{}, mod_conv::multiplies{}, Number<1>{}); + return accumulate_on_sequence(Lengths{}, math::multiplies{}, Number<1>{}); } template > @@ -109,7 +113,7 @@ struct ConstantTensorDescriptor // This is WRONG! align shouldbe applied to the last memory rank, not the last tensor // dimension constexpr index_t element_space_unaligned = accumulate_on_sequence( - (GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus{}, Number<1>{}); + (GetLengths() - Number<1>{}) * GetStrides(), math::plus{}, Number<1>{}); return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); } @@ -161,8 +165,7 @@ struct ConstantTensorDescriptor constexpr auto multi_id = Sequence{}; - return accumulate_on_sequence( - multi_id * GetStrides(), mod_conv::plus{}, Number<0>{}); + return accumulate_on_sequence(multi_id * GetStrides(), math::plus{}, Number<0>{}); } // emulate constexpr lambda @@ -323,7 +326,7 @@ struct ConstantTensorDescriptor constexpr auto fold_intervals = Sequence{}; constexpr index_t fold_intervals_product = - accumulate_on_sequence(fold_intervals, mod_conv::multiplies{}, Number<1>{}); + accumulate_on_sequence(fold_intervals, math::multiplies{}, Number<1>{}); constexpr auto unfold_length = GetLength(Number{}); constexpr auto unfold_stride = GetStride(Number{}); @@ -341,7 +344,7 @@ struct ConstantTensorDescriptor constexpr auto fold_strides = Number{} * reverse_inclusive_scan_sequence( - fold_intervals.PushBack(Number<1>{}), mod_conv::multiplies{}, Number<1>{}); + fold_intervals.PushBack(Number<1>{}), math::multiplies{}, Number<1>{}); // left and right constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{}; @@ -376,7 +379,7 @@ struct ConstantTensorDescriptor // unfolded length, stride constexpr index_t unfold_length = accumulate_on_sequence( - GetLengths().Extract(middle), mod_conv::multiplies{}, Number<1>{}); + GetLengths().Extract(middle), math::multiplies{}, Number<1>{}); constexpr 
index_t unfold_stride = GetStride(Number{}); @@ -511,3 +514,6 @@ print_ConstantTensorDescriptor(const char* s, Strides...); }); } + +} // namespace ck +#endif diff --git a/src/include/Sequence.hpp b/src/include/Sequence.hpp index 5d021631d6..09d67c13fa 100644 --- a/src/include/Sequence.hpp +++ b/src/include/Sequence.hpp @@ -1,7 +1,11 @@ -#pragma once +#ifndef CK_SEQUENCE_HPP +#define CK_SEQUENCE_HPP + #include "integral_constant.hpp" #include "functional.hpp" +namespace ck { + template struct is_valid_sequence_map; @@ -547,3 +551,6 @@ __host__ __device__ void print_Sequence(const char* s, Sequence) static_if{}( [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); } + +} // namespace ck +#endif diff --git a/src/include/amd_inline_asm.hpp b/src/include/amd_inline_asm.hpp index 05e34a26be..70e8cf31ba 100644 --- a/src/include/amd_inline_asm.hpp +++ b/src/include/amd_inline_asm.hpp @@ -1,4 +1,6 @@ -#pragma once +#ifndef CK_AMD_INLINE_ASM_HPP +#define CK_AMD_INLINE_ASM_HPP + #include "common.hpp" #define NO_VM_WAIT 0 @@ -7,6 +9,8 @@ #define NO_DS_WRITE 0 #define NO_GLB_READ 0 +namespace ck { + // cast a pointer of LDS to its address extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; @@ -759,3 +763,6 @@ ds_write_b128(const vector_type::MemoryType& r, void* lds, index_t off } #endif } + +} // namespace ck +#endif diff --git a/src/include/blockwise_2d_tensor_op.hpp b/src/include/blockwise_2d_tensor_op.hpp index d39a74a1a3..aeb4738d7e 100644 --- a/src/include/blockwise_2d_tensor_op.hpp +++ b/src/include/blockwise_2d_tensor_op.hpp @@ -1,7 +1,11 @@ -#pragma once +#ifndef CK_BLOCKWISE_2D_TENSOR_OP_HPP +#define CK_BLOCKWISE_2D_TENSOR_OP_HPP + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" +namespace ck { + template __device__ void blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f) @@ -192,7 +196,7 @@ struct Blockwise2dTensorCopy1 // but we need to make sure dst 
stride0 is big enough, // so that the out-of-bound write won't contaminate next line in dst constexpr index_t L1 = CopyLengths{}.Get(I1); - constexpr index_t read_per_d1 = mod_conv::integer_divide_ceil(L1, DataPerRead); + constexpr index_t read_per_d1 = math::integer_divide_ceil(L1, DataPerRead); static_assert(read_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0), "wrong! out-of-bound write will contaminate next line!\n"); @@ -209,7 +213,7 @@ struct Blockwise2dTensorCopy1 constexpr index_t L0 = CopyLengths{}.Get(I0); constexpr index_t L1 = CopyLengths{}.Get(I1); - constexpr index_t read_per_d1 = mod_conv::integer_divide_ceil(L1, DataPerRead); + constexpr index_t read_per_d1 = math::integer_divide_ceil(L1, DataPerRead); constexpr auto ref_desc = make_ConstantTensorDescriptor(Sequence{}); @@ -676,7 +680,7 @@ struct Blockwise2dTensorCopy3 } } -#if USE_AMD_INLINE_ASM +#if CK_USE_AMD_INLINE_ASM __device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src, Float* p_clipboard) const { @@ -796,3 +800,7 @@ struct Blockwise2dTensorCopy3 } #endif }; + +} // namespace ck + +#endif diff --git a/src/include/blockwise_3d_tensor_op.hpp b/src/include/blockwise_3d_tensor_op.hpp index ad647fc9da..4633dc1075 100644 --- a/src/include/blockwise_3d_tensor_op.hpp +++ b/src/include/blockwise_3d_tensor_op.hpp @@ -1,7 +1,11 @@ -#pragma once +#ifndef CK_BLOCKWISE_3D_TENSOR_OP_HPP +#define CK_BLOCKWISE_3D_TENSOR_OP_HPP + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" +namespace ck { + template {}); @@ -146,7 +150,7 @@ struct Blockwise3dTensorCopy3 // we allow out-of-bound read from src in D2 dimension, // but we need to make sure dst stride is big enough, // so that the out-of-bound write won't contaminate next line in dst - constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead); + constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead); static_assert(nloop_d2 * thread_per_d2 * DataPerRead <= 
DstDesc{}.GetStride(I1), "wrong! out-of-bound write will contaminate next line!\n"); @@ -158,7 +162,7 @@ struct Blockwise3dTensorCopy3 "wrrong! BlockSize is not big enough for ThreadPerDims!"); constexpr index_t num_active_thread = - accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies{}, Number<1>{}); + accumulate_on_sequence(ThreadPerDims{}, math::multiplies{}, Number<1>{}); if(BlockSize > num_active_thread) { @@ -205,7 +209,7 @@ struct Blockwise3dTensorCopy3 constexpr index_t nloop_d0 = L0 / thread_per_d0; constexpr index_t nloop_d1 = L1 / thread_per_d1; - constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead); + constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead); #pragma unroll for(index_t iloop_d0 = 0; iloop_d0 < nloop_d0; ++iloop_d0) @@ -251,7 +255,7 @@ struct Blockwise3dTensorCopy3 constexpr index_t nloop_d0 = L0 / thread_per_d0; constexpr index_t nloop_d1 = L1 / thread_per_d1; - constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead); + constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead); return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2; } @@ -283,7 +287,7 @@ struct Blockwise3dTensorCopy3 constexpr index_t nloop_d0 = L0 / thread_per_d0; constexpr index_t nloop_d1 = L1 / thread_per_d1; - constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead); + constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead); constexpr auto clipboard_desc = make_ConstantTensorDescriptor(Sequence{}); @@ -339,7 +343,7 @@ struct Blockwise3dTensorCopy3 constexpr index_t nloop_d0 = L0 / thread_per_d0; constexpr index_t nloop_d1 = L1 / thread_per_d1; - constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead); + constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead); constexpr auto clipboard_desc = 
make_ConstantTensorDescriptor(Sequence{}); @@ -368,3 +372,7 @@ struct Blockwise3dTensorCopy3 } } }; + +} // namespace ck + +#endif diff --git a/src/include/blockwise_4d_tensor_op.hpp b/src/include/blockwise_4d_tensor_op.hpp index df8f12be3e..51c30b08e4 100644 --- a/src/include/blockwise_4d_tensor_op.hpp +++ b/src/include/blockwise_4d_tensor_op.hpp @@ -1,6 +1,10 @@ -#pragma once +#ifndef CK_BLOCKWISE_4D_TENSOR_OP_HPP +#define CK_BLOCKWISE_4D_TENSOR_OP_HPP + #include "ConstantTensorDescriptor.hpp" -#include "threadwise_tensor_slice_op.hpp" +#include "threadwise_tensor_slice_copy.hpp" + +namespace ck { template __device__ void @@ -235,7 +239,7 @@ struct Blockwise4dTensorCopy1 // but we need to make sure dst stride2 is big enough, // so that the out-of-bound write won't contaminate next line in dst constexpr index_t L3 = CopyLengths{}.Get(I3); - constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead); + constexpr index_t read_per_d3 = math::integer_divide_ceil(L3, DataPerRead); static_assert(read_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2), "wrong! 
out-of-bound write will contaminate next line!\n"); @@ -256,7 +260,7 @@ struct Blockwise4dTensorCopy1 constexpr index_t L2 = CopyLengths{}.Get(I2); constexpr index_t L3 = CopyLengths{}.Get(I3); - constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead); + constexpr index_t read_per_d3 = math::integer_divide_ceil(L3, DataPerRead); constexpr auto ref_desc = make_ConstantTensorDescriptor_packed(Sequence{}); @@ -488,7 +492,7 @@ struct Blockwise4dTensorCopy3 // we allow out-of-bound read from src in D3 dimension, // but we need to make sure dst stride is big enough, // so that the out-of-bound write won't contaminate next line in dst - constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); + constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); static_assert(nloop_d3 * thread_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2), "wrong! out-of-bound write will contaminate next line!\n"); @@ -500,7 +504,7 @@ struct Blockwise4dTensorCopy3 "wrrong! 
BlockSize is not big enough for ThreadPerDims!"); constexpr index_t num_active_thread = - accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies{}, Number<1>{}); + accumulate_on_sequence(ThreadPerDims{}, math::multiplies{}, Number<1>{}); if(BlockSize > num_active_thread) { @@ -556,7 +560,7 @@ struct Blockwise4dTensorCopy3 constexpr index_t nloop_d0 = L0 / thread_per_d0; constexpr index_t nloop_d1 = L1 / thread_per_d1; constexpr index_t nloop_d2 = L2 / thread_per_d2; - constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); + constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); #pragma unroll for(index_t iloop_d0 = 0; iloop_d0 < nloop_d0; ++iloop_d0) @@ -613,7 +617,7 @@ struct Blockwise4dTensorCopy3 constexpr index_t nloop_d0 = L0 / thread_per_d0; constexpr index_t nloop_d1 = L1 / thread_per_d1; constexpr index_t nloop_d2 = L2 / thread_per_d2; - constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); + constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2 * nloop_d3; } @@ -650,7 +654,7 @@ struct Blockwise4dTensorCopy3 constexpr index_t nloop_d0 = L0 / thread_per_d0; constexpr index_t nloop_d1 = L1 / thread_per_d1; constexpr index_t nloop_d2 = L2 / thread_per_d2; - constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); + constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed( Sequence{}); @@ -717,7 +721,7 @@ struct Blockwise4dTensorCopy3 constexpr index_t nloop_d0 = L0 / thread_per_d0; constexpr index_t nloop_d1 = L1 / thread_per_d1; constexpr index_t nloop_d2 = L2 / thread_per_d2; - constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); + constexpr index_t nloop_d3 = 
math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed( Sequence{}); @@ -768,3 +772,7 @@ struct Blockwise4dTensorCopyReorder1 SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy); } }; + +} // namespace ck + +#endif diff --git a/src/include/blockwise_batched_gemm.hpp b/src/include/blockwise_batched_gemm.hpp index 937bf5ee8a..340c25df55 100644 --- a/src/include/blockwise_batched_gemm.hpp +++ b/src/include/blockwise_batched_gemm.hpp @@ -1,6 +1,10 @@ -#pragma once +#ifndef CK_BLOCKWISE_BATCHED_GEMM_HPP +#define CK_BLOCKWISE_BATCHED_GEMM_HPP + #include "threadwise_gemm.hpp" +namespace ck { + template __device__ void Run_asm(const FloatA* __restrict__ p_a_block, const FloatB* __restrict__ p_b_block, @@ -518,3 +522,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 } } }; + +} // namespace ck +#endif diff --git a/src/include/blockwise_gemm.hpp b/src/include/blockwise_gemm.hpp index 31fb4ed15c..0fc9a7bb67 100644 --- a/src/include/blockwise_gemm.hpp +++ b/src/include/blockwise_gemm.hpp @@ -1,7 +1,11 @@ -#pragma once +#ifndef CK_BLOCKWISE_GEMM_HPP +#define CK_BLOCKWISE_GEMM_HPP + #include "common.hpp" #include "threadwise_gemm.hpp" +namespace ck { + // if following number are power of 2, index calculation shall be greatly reduced: // MPerThreadSubC, NPerThreadSubC, MLevel0Cluster, NLevel0Cluster, MLevel1Cluster, NLevel1Cluster template __device__ void Run_asm(const FloatA* __restrict__ p_a_block, @@ -423,3 +427,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 } } }; + +} // namespace ck +#endif diff --git a/src/include/blockwise_generic_tensor_slice_op.hpp b/src/include/blockwise_generic_tensor_slice_copy.hpp similarity index 98% rename from src/include/blockwise_generic_tensor_slice_op.hpp rename to src/include/blockwise_generic_tensor_slice_copy.hpp index d9b9a3a2e5..6b887380e7 100644 --- a/src/include/blockwise_generic_tensor_slice_op.hpp +++ 
b/src/include/blockwise_generic_tensor_slice_copy.hpp @@ -1,5 +1,9 @@ -#pragma once -#include "threadwise_tensor_slice_op.hpp" +#ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP +#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP + +#include "threadwise_generic_tensor_slice_copy.hpp" + +namespace ck { // slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor // memory layout (ordering of dimensions) can be different between src and dst @@ -142,10 +146,10 @@ struct BlockwiseGenericTensorSliceCopy_v1 // complete offset mThreadSrcOffset = accumulate_on_array( - mThreadSrcPartialOffsets, mod_conv::plus{}, static_cast(0)); + mThreadSrcPartialOffsets, math::plus{}, static_cast(0)); mThreadDstOffset = accumulate_on_array( - mThreadDstPartialOffsets, mod_conv::plus{}, static_cast(0)); + mThreadDstPartialOffsets, math::plus{}, static_cast(0)); #if 0 if(get_block_1d_id() == 0) @@ -388,3 +392,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 }); } }; + +} // namespace ck + +#endif diff --git a/src/include/blockwise_tensor_slice_op.hpp b/src/include/blockwise_tensor_slice_copy.hpp similarity index 94% rename from src/include/blockwise_tensor_slice_op.hpp rename to src/include/blockwise_tensor_slice_copy.hpp index 915920588a..ed0f12c9a5 100644 --- a/src/include/blockwise_tensor_slice_op.hpp +++ b/src/include/blockwise_tensor_slice_copy.hpp @@ -1,5 +1,9 @@ -#pragma once -#include "threadwise_tensor_slice_op.hpp" +#ifndef CK_BLOCKWISE_TENSOR_SLICE_COPY_HPP +#define CK_BLOCKWISE_TENSOR_SLICE_COPY_HPP + +#include "threadwise_tensor_slice_copy.hpp" + +namespace ck { template {}, - SrcLengths{}, - src_data_per_cluster_per_dims); + constexpr auto repeat_lengths = transform_sequences( + math::integer_divide_ceiler{}, SrcLengths{}, src_data_per_cluster_per_dims); constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; @@ -188,10 +190,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto src_data_per_cluster_per_dims = 
thread_sub_tensor_lengths * SrcClusterLengths{}; - constexpr auto repeat_lengths = - transform_sequences(mod_conv::integer_divide_ceiler{}, - SrcLengths{}, - src_data_per_cluster_per_dims); + constexpr auto repeat_lengths = transform_sequences( + math::integer_divide_ceiler{}, SrcLengths{}, src_data_per_cluster_per_dims); constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; @@ -226,10 +226,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto src_data_per_cluster_per_dims = thread_sub_tensor_lengths * SrcClusterLengths{}; - constexpr auto repeat_lengths = - transform_sequences(mod_conv::integer_divide_ceiler{}, - SrcLengths{}, - src_data_per_cluster_per_dims); + constexpr auto repeat_lengths = transform_sequences( + math::integer_divide_ceiler{}, SrcLengths{}, src_data_per_cluster_per_dims); constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; @@ -294,3 +292,6 @@ struct BlockwiseTensorSliceReorderCopy_v3 }).Else([&](auto fwd) { mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim); }); } }; + +} // namespace ck +#endif diff --git a/src/include/common.hpp b/src/include/common.hpp index 34c2af6c39..e52aa7741f 100644 --- a/src/include/common.hpp +++ b/src/include/common.hpp @@ -1,5 +1,7 @@ -#pragma once -#include "base.hpp" +#ifndef CK_COMMON_HPP +#define CK_COMMON_HPP + +#include "utility.hpp" #include "vector_type.hpp" #include "integral_constant.hpp" #include "Sequence.hpp" @@ -8,6 +10,8 @@ #include "functional2.hpp" #include "functional3.hpp" -#if USE_AMD_INLINE_ASM +#if CK_USE_AMD_INLINE_ASM #include "amd_inline_asm.hpp" #endif + +#endif diff --git a/src/include/config.h.in b/src/include/config.hpp.in similarity index 89% rename from src/include/config.h.in rename to src/include/config.hpp.in index c97d71def2..1bd7500c12 100644 --- a/src/include/config.h.in +++ b/src/include/config.hpp.in @@ -1,23 +1,30 @@ -#pragma once +#ifndef CK_CONFIG_HPP +#define CK_CONFIG_HPP + #cmakedefine01 
DEVICE_BACKEND_HIP #cmakedefine01 DEVICE_BACKEND_CUDA #if DEVICE_BACKEND_HIP #include "hip/hip_runtime.h" #include "hip/hip_fp16.h" -#define USE_AMD_INLINE_ASM 1 +#define CK_USE_AMD_INLINE_ASM 1 -// For some reason, HIP compiler need this definition to generate optimal load and store -// instruction -typedef float float2_t __attribute__((ext_vector_type(2))); -typedef float float4_t __attribute__((ext_vector_type(4))); #elif DEVICE_BACKEND_CUDA #include "cuda_runtime.h" #include "cuda_fp16.h" #include "nvToolsExt.h" #include "helper_cuda.h" -#define USE_AMD_INLINE_ASM 0 +#define CK_USE_AMD_INLINE_ASM 0 +#endif +namespace ck { + +#if DEVICE_BACKEND_HIP +// For some reason, HIP compiler need this definition to generate optimal load and store +// instruction +typedef float float2_t __attribute__((ext_vector_type(2))); +typedef float float4_t __attribute__((ext_vector_type(4))); +#else // For some reason, CUDA need this definition, otherwise // compiler won't generate optimal load and store instruction, and // kernel would produce wrong result, indicating the compiler fail to generate correct @@ -58,3 +65,7 @@ __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const i #endif } #endif + +} // namespace ck + +#endif diff --git a/src/include/conv_common.hpp b/src/include/conv_common.hpp index 6d8a20d8fa..254f4c5651 100644 --- a/src/include/conv_common.hpp +++ b/src/include/conv_common.hpp @@ -1,6 +1,10 @@ -#pragma once +#ifndef CK_CONV_COMMON_HPP +#define CK_CONV_COMMON_HPP + #include "ConstantTensorDescriptor.hpp" +using namespace ck; + // this is ugly, only for 4d template constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc) @@ -117,3 +121,5 @@ constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc, return sizeof(Float) * (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace()); } + +#endif diff --git a/src/include/device.hpp b/src/include/device.hpp index 
066866858b..5766d8f990 100644 --- a/src/include/device.hpp +++ b/src/include/device.hpp @@ -1,6 +1,10 @@ -#pragma once +#ifndef CK_DEVICE_HPP +#define CK_DEVICE_HPP + #include -#include "config.h" +#include "config.hpp" + +using namespace ck; struct DeviceMem { @@ -56,3 +60,5 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt return timer.GetElapsedTime(); } + +#endif diff --git a/src/include/functional.hpp b/src/include/functional.hpp index 84e5cffe83..15c957e1c5 100644 --- a/src/include/functional.hpp +++ b/src/include/functional.hpp @@ -1,7 +1,11 @@ -#pragma once +#ifndef CK_FUNCTIONAL_HPP +#define CK_FUNCTIONAL_HPP + #include "integral_constant.hpp" #include "Sequence.hpp" +namespace ck { + struct forwarder { template @@ -70,3 +74,6 @@ struct static_if return Type{}; } }; + +} // namespace ck +#endif diff --git a/src/include/functional2.hpp b/src/include/functional2.hpp index c1dec36575..3820056593 100644 --- a/src/include/functional2.hpp +++ b/src/include/functional2.hpp @@ -1,7 +1,11 @@ -#pragma once +#ifndef CK_FUNCTIONAL2_HPP +#define CK_FUNCTIONAL2_HPP + #include "functional.hpp" #include "Sequence.hpp" +namespace ck { + template struct static_for_impl; @@ -59,3 +63,6 @@ accumulate_on_sequence(Seq, Reduce f, Number /*initial_value*/) return result; } + +} // namespace ck +#endif diff --git a/src/include/functional3.hpp b/src/include/functional3.hpp index ee3ab656f4..fc5f8a6bab 100644 --- a/src/include/functional3.hpp +++ b/src/include/functional3.hpp @@ -1,9 +1,13 @@ -#pragma once +#ifndef CK_FUNCTIONAL3_HPP +#define CK_FUNCTIONAL3_HPP + #include "functional.hpp" #include "functional2.hpp" #include "Sequence.hpp" #include "Array.hpp" +namespace ck { + // RemainLengths: Sequence<...> template struct static_ford_impl @@ -107,3 +111,6 @@ struct ford } } }; + +} // namespace ck +#endif diff --git a/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp b/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp index 
5bc5aa39a3..83b2a67f7f 100644 --- a/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp +++ b/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp @@ -1,11 +1,15 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW_HPP +#define CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW_HPP + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "blockwise_2d_tensor_op.hpp" #include "blockwise_4d_tensor_op.hpp" -#include "threadwise_tensor_slice_op.hpp" +#include "threadwise_tensor_slice_copy.hpp" #include "threadwise_direct_convolution.hpp" +namespace ck { + template {}); } }; + +} // namespace ck +#endif diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp index 2ba42fb723..8c42441fac 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp @@ -1,13 +1,17 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN_HPP +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN_HPP + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_4d_tensor_op.hpp" #include "blockwise_2d_tensor_op.hpp" -#include "threadwise_tensor_slice_op.hpp" +#include "threadwise_tensor_slice_copy.hpp" #include "threadwise_4d_tensor_op.hpp" #include "blockwise_batched_gemm.hpp" +namespace ck { + template {}, @@ -119,11 +123,11 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn constexpr auto wei_cyx_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); constexpr auto wei_c_y_x_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( Sequence{}); @@ -390,3 +394,6 @@ 
struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn }); } }; + +} // namespace ck +#endif diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp index e9744ed2fb..6d7bc47d37 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp @@ -1,14 +1,18 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN_HPP +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN_HPP + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_2d_tensor_op.hpp" #include "blockwise_3d_tensor_op.hpp" #include "blockwise_4d_tensor_op.hpp" -#include "threadwise_tensor_slice_op.hpp" +#include "threadwise_tensor_slice_copy.hpp" #include "threadwise_4d_tensor_op.hpp" #include "blockwise_batched_gemm.hpp" +namespace ck { + template {}, @@ -120,7 +124,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn constexpr auto wei_c_x_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( Sequence{}); @@ -426,3 +430,6 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn }); } }; + +} // namespace ck +#endif diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp deleted file mode 100644 index dd21a67eea..0000000000 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp +++ /dev/null @@ -1,352 +0,0 @@ -#pragma once -#include "common.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMatrixDescriptor.hpp" -#include "blockwise_2d_tensor_op.hpp" -#include "blockwise_3d_tensor_op.hpp" -#include 
"blockwise_tensor_slice_op.hpp" -#include "threadwise_tensor_slice_op.hpp" -#include "threadwise_4d_tensor_op.hpp" -#include "blockwise_batched_gemm.hpp" - -template -struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn -{ - __device__ void Run(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) const - { - // be careful of this assertion - static_assert( - NPerBlock % NPerThread == 0 && - ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) || - (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0)), - "wrong!"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; - constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{}; - constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{}; - - constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1); - - constexpr index_t K = out_k_h_w_n_global_desc.GetLength(I0); - constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1); - constexpr index_t Wo = out_k_h_w_n_global_desc.GetLength(I2); - constexpr index_t N = out_k_h_w_n_global_desc.GetLength(I3); - - constexpr index_t Y = wei_c_y_x_k_global_desc.GetLength(I1); - constexpr index_t X = wei_c_y_x_k_global_desc.GetLength(I2); - - constexpr index_t HiPerBlock = HoPerBlock + Y - 1; - constexpr index_t WiPerBlock = WoPerBlock + X - 1; - - // divide block work: [K, Ho, Wo, N] - static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 && - Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, - "wrong! 
cannot evenly divide work for workgroup "); - - constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock; - constexpr index_t HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock; - constexpr index_t WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock; - constexpr index_t NBlockWork = (N + NPerBlock - 1) / NPerBlock; - - const index_t k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork); - index_t itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork); - const index_t h_block_work_id = itmp / (WBlockWork * NBlockWork); - itmp -= h_block_work_id * (WBlockWork * NBlockWork); - const index_t w_block_work_id = itmp / NBlockWork; - const index_t n_block_work_id = itmp - w_block_work_id * NBlockWork; - - const index_t k_block_data_begin = k_block_work_id * KPerBlock; - const index_t ho_block_data_begin = h_block_work_id * HoPerBlock; - const index_t wo_block_data_begin = w_block_work_id * WoPerBlock; - const index_t n_block_data_begin = n_block_work_id * NPerBlock; - - const index_t hi_block_data_begin = ho_block_data_begin; - const index_t wi_block_data_begin = wo_block_data_begin; - - // global tensor view - constexpr auto wei_c_x_k_global_desc = - make_ConstantTensorDescriptor(Sequence{}, Sequence{}); - - // LDS tensor view - // be careful of alignment - constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N, - WeiBlockCopyDataPerRead_K, - GemmDataPerReadA, - GemmDataPerReadB); - - constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); - - constexpr auto wei_c_x_k_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); - - // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); - - // blockwise copy - // input: format is [N, C, Hi, Wi] to [C, Hi, Wi, N] - constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{}; - - const auto blockwise_in_copy_reorder = 
BlockwiseTensorSliceReorderCopy_v3< - BlockSize, - Float, - decltype(in_n_c_h_w_global_desc), - decltype(in_c_h_w_n_block_desc), - Sequence, - InBlockReorderSrcSubLengths_NCHW, - InBlockReorderSrcClusterLengths_NCHW, - decltype(map_chwn2nchw), - InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, - InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N>{}; - - // blockwise wei copy - // format is [CPerBlock, X * KPerBlock] - const auto blockwise_wei_copy = - Blockwise3dTensorCopy3{}; - - // a series of blockwise batched GEMM - // C_matrix += transpose(A_matrix) * B_matrix - // A_matrix and B_matrix saved in LDS, C_matrix saved in register - // A_matrix[C,K] is a sub-matrix of wei_block[C,K] - // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] - // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N] - constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto b_c_wn_block_mtx_desc = - make_ConstantMatrixDescriptor(Number{}, - Number{}, - Number{}); - - constexpr auto c_k_wn_thread_mtx_desc = - make_ConstantMatrixDescriptor(Number{}, - Number{}, - Number{}); - - const auto blockwise_batch_gemm = - BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2< - BlockSize, - decltype(a_c_k_block_mtx_desc), - decltype(b_c_wn_block_mtx_desc), - decltype(c_k_wn_thread_mtx_desc), - 0, - in_c_h_w_n_block_desc.GetStride(I1), - out_k_h_w_n_thread_desc.GetStride(I1), - HoPerBlock, - GemmMPerThreadSubC, - GemmNPerThreadSubC, - GemmMLevel0Cluster, - GemmNLevel0Cluster, - GemmMLevel1Cluster, - GemmNLevel1Cluster, - GemmKPerThreadLoop, - HoPerThread, - GemmDataPerReadA, - GemmDataPerReadB>{}; - - // LDS: be careful of alignment - constexpr index_t in_block_space = - in_c_h_w_n_block_desc.GetElementSpace(Number{}); - constexpr index_t wei_block_space = - wei_c_x_k_block_desc.GetElementSpace(Number{}); - - __shared__ Float p_in_block[in_block_space]; - __shared__ Float p_wei_block[wei_block_space]; - - 
// register - Float p_out_thread[out_k_h_w_n_thread_desc.GetElementSpace()]; - -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(in_c_h_w_n_global_desc, "in_c_h_w_n_global_desc"); - print_ConstantTensorDescriptor(wei_c_y_x_k_global_desc, "wei_c_y_x_k_global_desc"); - - print_ConstantTensorDescriptor(in_c_h_w_n_block_desc, "in_c_h_w_n_block_desc"); - print_ConstantTensorDescriptor(wei_c_x_k_block_desc, "wei_c_x_k_block_desc"); - - printf("in_block_space %u, wei_block_space %u\n", in_block_space, wei_block_space); - } -#endif - - // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); - -#if 0 - const Float* p_in_global_block_offset = - p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( - n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin); - - const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin); - - for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, - p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), - p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) - { - for(index_t y = 0; y < Y; ++y) - { - blockwise_in_copy_reorder.Run(p_in_global_block_offset + - in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, 0), - p_in_block); - - blockwise_wei_copy.Run(p_wei_global_block_offset + - wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0), - p_wei_block); - - __syncthreads(); - - for(index_t x = 0; x < X; ++x) - { - blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0), - p_in_block + - in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0), - p_out_thread); - } - - __syncthreads(); - } - } -#else - for(index_t y = 0; y < Y; ++y) - { - const Float* p_in_global_block_offset = - p_in_global + - 
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( - n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin); - - const Float* p_wei_global_block_offset = - p_wei_global + - wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, k_block_data_begin); - - for(index_t - c_block_data_begin = 0; - c_block_data_begin < C; - c_block_data_begin += CPerBlock, - p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), - p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) - { - Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()]; - Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_clipboard); - - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block); - blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_clipboard, p_in_block); - - __syncthreads(); - - for(index_t x = 0; x < X; ++x) - { - blockwise_batch_gemm.Run( - p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0), - p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0), - p_out_thread); - } - - __syncthreads(); - } - } -#endif - - // output: register to global mem, - const auto c_thread_mtx_begin = - blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - - const index_t k_thread_data_begin = c_thread_mtx_begin.row; - const index_t ho_thread_data_begin = c_thread_mtx_begin.batch; - const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; - const index_t n_thread_data_begin = - c_thread_mtx_begin.col - NPerBlock * wo_thread_data_begin; - - // output is a 10d tensor - constexpr index_t N2 = GemmNPerThreadSubC; - constexpr index_t N1 = NPerBlock / N2; - - constexpr index_t W2 = - (GemmNLevel0Cluster * GemmNLevel1Cluster) / (NPerBlock / 
GemmNPerThreadSubC); - constexpr index_t W1 = WoPerBlock / W2; - - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; - - constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( - Sequence{}); - - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); - -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(out_khwn_thread_desc, "out_khwn_thread_desc"); - print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc"); - - print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc"); - print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc"); - } -#endif - - threadwise_10d_tensor_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - } -}; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp index 8971cb747e..5f5d8a1d92 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp @@ -1,13 +1,17 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_2d_tensor_op.hpp" #include "blockwise_4d_tensor_op.hpp" -#include "threadwise_tensor_slice_op.hpp" +#include "threadwise_tensor_slice_copy.hpp" #include "threadwise_4d_tensor_op.hpp" #include "blockwise_batched_gemm.hpp" 
+namespace ck { + template {}); @@ -99,10 +103,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N, - WeiBlockCopyDataPerRead_K, - GemmDataPerReadA, - GemmDataPerReadB); + constexpr index_t max_align = math::lcm(InBlockCopyDataPerRead_N, + WeiBlockCopyDataPerRead_K, + GemmDataPerReadA, + GemmDataPerReadB); constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, @@ -115,7 +119,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( @@ -416,3 +420,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn }); } }; + +} // namespace ck +#endif diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp similarity index 95% rename from src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp rename to src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp index 64210aef07..3ef597cb56 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -1,13 +1,17 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_2d_tensor_op.hpp" #include "blockwise_4d_tensor_op.hpp" -#include 
"threadwise_tensor_slice_op.hpp" +#include "threadwise_tensor_slice_copy.hpp" #include "threadwise_4d_tensor_op.hpp" #include "blockwise_batched_gemm.hpp" +namespace ck { + template -struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn +struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer { __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, @@ -80,10 +84,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, "wrong! cannot evenly divide work for workgroup "); - constexpr index_t KBlockWork = mod_conv::integer_divide_ceil(K, KPerBlock); - constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); - constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); - constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock); + constexpr index_t KBlockWork = math::integer_divide_ceil(K, KPerBlock); + constexpr index_t HBlockWork = math::integer_divide_ceil(Ho, HoPerBlock); + constexpr index_t WBlockWork = math::integer_divide_ceil(Wo, WoPerBlock); + constexpr index_t NBlockWork = math::integer_divide_ceil(N, NPerBlock); constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed( Sequence{}); @@ -104,10 +108,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N, - WeiBlockCopyDataPerRead_K, - GemmDataPerReadA, - GemmDataPerReadB); + constexpr index_t max_align = math::lcm(InBlockCopyDataPerRead_N, + WeiBlockCopyDataPerRead_K, + GemmDataPerReadA, + GemmDataPerReadB); constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, @@ -120,7 +124,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn constexpr auto wei_c_k_block_desc = 
make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( @@ -466,3 +470,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn }); } }; + +} // namespace ck +#endif diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp deleted file mode 100644 index 8fb643159f..0000000000 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp +++ /dev/null @@ -1,501 +0,0 @@ -#pragma once -#include "common.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMatrixDescriptor.hpp" -#include "blockwise_2d_tensor_op.hpp" -#include "blockwise_tensor_slice_op.hpp" -#include "threadwise_tensor_slice_op.hpp" -#include "threadwise_4d_tensor_op.hpp" -#include "blockwise_batched_gemm.hpp" - -template -struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn -{ - __device__ void Run(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) const - { - // be careful of this assertion - static_assert( - NPerBlock % NPerThread == 0 && - ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) || - (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0)), - "wrong!"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; - constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{}; - constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{}; - - constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1); - - constexpr index_t K = 
out_k_h_w_n_global_desc.GetLength(I0); - constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1); - constexpr index_t Wo = out_k_h_w_n_global_desc.GetLength(I2); - constexpr index_t N = out_k_h_w_n_global_desc.GetLength(I3); - - constexpr index_t Y = wei_c_y_x_k_global_desc.GetLength(I1); - constexpr index_t X = wei_c_y_x_k_global_desc.GetLength(I2); - - // assert for LDS double buffer - static_assert(C % (2 * CPerBlock) == 0, "C cannot be evenly divided"); - - // divide block work: [K, Ho, Wo, N] - static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 && - Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, - "wrong! cannot evenly divide work for workgroup "); - - constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock); - constexpr index_t KBlockWork = mod_conv::integer_divide_ceil(K, KPerBlock); - constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); - constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); - - constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed( - Sequence{}); - - const auto block_work_multi_id = - block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); - - const index_t n_block_data_begin = block_work_multi_id[0] * NPerBlock; - const index_t k_block_data_begin = block_work_multi_id[1] * KPerBlock; - const index_t ho_block_data_begin = block_work_multi_id[2] * HoPerBlock; - const index_t wo_block_data_begin = block_work_multi_id[3] * WoPerBlock; - - const index_t hi_block_data_begin = ho_block_data_begin; - const index_t wi_block_data_begin = wo_block_data_begin; - - // global tensor view - constexpr auto wei_c_k_global_desc = - make_ConstantTensorDescriptor(Sequence{}, Sequence{}); - - // LDS tensor view - // be careful of alignment - constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N, - WeiBlockCopyDataPerRead_K, - GemmDataPerReadA, - GemmDataPerReadB); - - constexpr auto in_c_h_w_n_block_desc = 
make_ConstantTensorDescriptor_aligned( - Sequence{}, - Number{}); - - // this check is ad-hoc - // TODO: need to properly implement tensor descriptor with alignment - static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, - "GemmDataPerReadB alignment requirement is not meet"); - - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, - Number{}); - - // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( - Sequence{}); - - // blockwise copy - // input: format is [N, C, Hi, Wi] to [C, Hi, Wi, N] - constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{}; - - const auto blockwise_in_copy_reorder = BlockwiseTensorSliceReorderCopy_v3< - BlockSize, - Float, - decltype(in_n_c_h_w_global_desc), - decltype(in_c_h_w_n_block_desc), - Sequence, - InBlockReorderSrcSubLengths_NCHW, - InBlockReorderSrcClusterLengths_NCHW, - decltype(map_chwn2nchw), - InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, - InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N>({0, 0, 0, 0}, {0, 0, 0, 0}); - - // blockwise wei copy - // format is [CPerBlock, KPerBlock] - const auto blockwise_wei_copy = - Blockwise2dTensorCopy3{}; - - // a series of blockwise batched GEMM - // C_matrix += transpose(A_matrix) * B_matrix - // A_matrix and B_matrix saved in LDS, C_matrix saved in register - // A_matrix[C,K] is a sub-matrix of wei_block[C,K] - // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] - // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N] - constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto b_c_wn_block_mtx_desc = - make_ConstantMatrixDescriptor(Number{}, - Number{}, - Number{}); - - constexpr auto c_k_wn_thread_mtx_desc = - make_ConstantMatrixDescriptor(Number{}, - Number{}, - Number{}); - - const auto blockwise_batch_gemm = - 
BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2< - BlockSize, - decltype(a_c_k_block_mtx_desc), - decltype(b_c_wn_block_mtx_desc), - decltype(c_k_wn_thread_mtx_desc), - 0, - in_c_h_w_n_block_desc.GetStride(I1), - out_k_h_w_n_thread_desc.GetStride(I1), - HoPerBlock, - GemmMPerThreadSubC, - GemmNPerThreadSubC, - GemmMLevel0Cluster, - GemmNLevel0Cluster, - GemmMLevel1Cluster, - GemmNLevel1Cluster, - GemmKPerThreadLoop, - HoPerThread, - GemmDataPerReadA, - GemmDataPerReadB>{}; - - // choose GEMM implementation here - const auto run_blockwise_batch_gemm = [&](auto... Xs) { -#if 1 - return blockwise_batch_gemm.Run(Xs...); -#elif 0 - return blockwise_batch_gemm.Run_asm(Xs...); -#else - return blockwise_batch_gemm.Run_asm_v2(Xs...); -#endif - }; - - // LDS: be careful of alignment - constexpr index_t in_block_space = - in_c_h_w_n_block_desc.GetElementSpace(Number{}); - constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number{}); - - // LDS double buffer - __shared__ Float p_in_block_double[2 * in_block_space]; - __shared__ Float p_wei_block_double[2 * wei_block_space]; - - // register - // C++ lambda doesn't capture array, use pointer instead - Float p_out_thread_data[out_k_h_w_n_thread_desc.GetElementSpace()]; - Float* const p_out_thread = p_out_thread_data; - -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(in_c_h_w_n_global_desc, "in_c_h_w_n_global_desc"); - print_ConstantTensorDescriptor(wei_c_y_x_k_global_desc, "wei_c_y_x_k_global_desc"); - - print_ConstantTensorDescriptor(in_c_h_w_n_block_desc, "in_c_h_w_n_block_desc"); - print_ConstantTensorDescriptor(wei_c_k_block_desc, "wei_c_k_block_desc"); - - printf("in_block_space %u, wei_block_space %u\n", in_block_space, wei_block_space); - } -#endif - - // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); - - for(index_t y = 0; y < Y; ++y) - { - for(index_t x = 0; x < X; ++x) - { 
- const Float* p_in_global_block_offset = - p_in_global + - in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( - n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x); - - const Float* p_wei_global_block_offset = - p_wei_global + - wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin); - - // LDS double buffer: preload data into LDS - { - Float p_in_register_clipboard[blockwise_in_copy_reorder - .GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); - - blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_double); - } - - // LDS double buffer: main body - for(index_t c_block_data_begin = 0; c_block_data_begin + 2 * CPerBlock < C; - c_block_data_begin += 2 * CPerBlock) - { -#pragma unroll - for(index_t iloop = 0; iloop < 2; ++iloop) - { - const bool even_loop = (iloop % 2 == 0); - - Float* p_in_block_now = - even_loop ? p_in_block_double : p_in_block_double + in_block_space; - Float* p_wei_block_now = - even_loop ? p_wei_block_double : p_wei_block_double + wei_block_space; - - Float* p_in_block_next = - even_loop ? p_in_block_double + in_block_space : p_in_block_double; - Float* p_wei_block_next = - even_loop ? 
p_wei_block_double + wei_block_space : p_wei_block_double; - - Float p_in_register_clipboard[blockwise_in_copy_reorder - .GetRegisterClipboardSize()]; - Float - p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - p_in_global_block_offset += - CPerBlock * in_n_c_h_w_global_desc.GetStride(I1); - p_wei_global_block_offset += - CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0); - - __syncthreads(); - - // LDS doubel buffer: load next data from device mem - blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); - - // LDS double buffer: GEMM on current data - run_blockwise_batch_gemm(p_wei_block_now, p_in_block_now, p_out_thread); - - // LDS double buffer: store next data to LDS - blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_next); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_next); - } - } - - // LDS double buffer: tail - { - Float p_in_register_clipboard[blockwise_in_copy_reorder - .GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - // even iteration - p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1); - p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0); - - __syncthreads(); - - // LDS doubel buffer: load next data from device mem - blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); - - // LDS double buffer: GEMM on current data - run_blockwise_batch_gemm(p_wei_block_double, p_in_block_double, p_out_thread); - - // LDS double buffer: store next data to LDS - blockwise_in_copy_reorder.RunStoreRegisterClipboard( - p_in_register_clipboard, 
p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterClipboard( - p_wei_register_clipboard, p_wei_block_double + wei_block_space); - - // odd iteration - __syncthreads(); - - // LDS double buffer: GEMM on current data - run_blockwise_batch_gemm(p_wei_block_double + wei_block_space, - p_in_block_double + in_block_space, - p_out_thread); - } - } - } - - // output: register to global mem, - const auto c_thread_mtx_begin = - blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - - const index_t k_thread_data_begin = c_thread_mtx_begin.row; - const index_t ho_thread_data_begin = c_thread_mtx_begin.batch; - const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; - const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; - - static_if{}([&](auto fwd) { // fwd do nothing but - // perfect forwarding. - // Using this trick to - // make this lambda a generic lambda, so it won't be compiled until - // instantiated - static_assert( - (fwd(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), - "wrong!"); - - // output is a 10d tensor - constexpr index_t N2 = GemmNPerThreadSubC; - constexpr index_t N1 = NPerBlock / N2; - - constexpr index_t W2 = - (GemmNLevel0Cluster * GemmNLevel1Cluster) / fwd(NPerBlock / GemmNPerThreadSubC); - constexpr index_t W1 = WoPerBlock / W2; - - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; - -#if 0 - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); - - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); -#else - constexpr auto out_10d_global_desc = fwd(out_k_h_w_n_global_desc) - .Fold(I3, Number{}, Number{}) - .Fold(I2, Number{}, Number{}) - .Fold(I0, Number{}, Number{}); - - constexpr auto out_10d_thread_desc = fwd(out_k_h_w_n_thread_desc) - .Fold(I3, Number<1>{}, Number{}) - .Fold(I2, Number{}, Number<1>{}) - .Fold(I0, Number<1>{}, Number{}); -#endif 
- -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, - "out_k_h_w_n_thread_desc"); - print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc"); - - print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, - "out_k_h_w_n_global_desc"); - print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc"); - } -#endif - - threadwise_tensor_slice_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }).Else([&](auto fwd) { - static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0, - "wrong!"); - - // output is a 10d tensor - constexpr index_t N1 = NPerBlock; - - constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; - constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; - constexpr index_t W1 = WoPerBlock / fwd(W2 * W3); - - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; - -#if 0 - constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_packed( - Sequence{}); - - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_packed( - Sequence{}); -#else - constexpr auto out_10d_global_desc = - fwd(out_k_h_w_n_global_desc) - .Fold(I3, Number{}) - .Fold(I2, Number{}, Number{}, Number{}) - .Fold(I0, Number{}, Number{}); - - constexpr auto out_10d_thread_desc = - fwd(out_k_h_w_n_thread_desc) - .Fold(I3, Number{}) - .Fold(I2, Number{}, Number<1>{}, Number{}) - .Fold(I0, Number<1>{}, Number{}); -#endif - -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, - 
"out_k_h_w_n_thread_desc"); - print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc"); - - print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, - "out_k_h_w_n_global_desc"); - print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc"); - - for(index_t i = 0; i < 64; ++i) - { - printf("out %f, ", p_out_thread[i]); - } - } -#endif - - threadwise_tensor_slice_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }); - } -}; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp deleted file mode 100644 index 132f445e5a..0000000000 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp +++ /dev/null @@ -1,462 +0,0 @@ -#pragma once -#include "common.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMatrixDescriptor.hpp" -#include "blockwise_2d_tensor_op.hpp" -#include "blockwise_tensor_slice_op.hpp" -#include "threadwise_tensor_slice_op.hpp" -#include "threadwise_4d_tensor_op.hpp" -#include "blockwise_batched_gemm.hpp" - -template -struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn -{ - __device__ void Run(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) const - { - // be careful of this assertion - static_assert( - NPerBlock % NPerThread == 0 && - ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) || - (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0)), - "wrong!"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = 
Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; - constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{}; - constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{}; - - constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1); - - constexpr index_t K = out_k_h_w_n_global_desc.GetLength(I0); - constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1); - constexpr index_t Wo = out_k_h_w_n_global_desc.GetLength(I2); - constexpr index_t N = out_k_h_w_n_global_desc.GetLength(I3); - - constexpr index_t Y = wei_c_y_x_k_global_desc.GetLength(I1); - constexpr index_t X = wei_c_y_x_k_global_desc.GetLength(I2); - - // divide block work: [K, Ho, Wo, N] - static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 && - Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, - "wrong! cannot evenly divide work for workgroup "); - - constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock; - constexpr index_t HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock; - constexpr index_t WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock; - constexpr index_t NBlockWork = (N + NPerBlock - 1) / NPerBlock; - - const index_t k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork); - index_t itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork); - const index_t h_block_work_id = itmp / (WBlockWork * NBlockWork); - itmp -= h_block_work_id * (WBlockWork * NBlockWork); - const index_t w_block_work_id = itmp / NBlockWork; - const index_t n_block_work_id = itmp - w_block_work_id * NBlockWork; - - const index_t k_block_data_begin = k_block_work_id * KPerBlock; - const index_t ho_block_data_begin = h_block_work_id * HoPerBlock; - const index_t wo_block_data_begin = w_block_work_id * WoPerBlock; - const index_t n_block_data_begin = n_block_work_id * NPerBlock; - - const index_t hi_block_data_begin = ho_block_data_begin; - const index_t 
wi_block_data_begin = wo_block_data_begin; - - // global tensor view - constexpr auto wei_c_k_global_desc = - make_ConstantTensorDescriptor(Sequence{}, Sequence{}); - - // LDS tensor view - // be careful of alignment - constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N, - WeiBlockCopyDataPerRead_K, - GemmDataPerReadA, - GemmDataPerReadB); - - constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, - Number{}); - - // this check is ad-hoc - // TODO: need to properly implement tensor descriptor with alignment - static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, - "GemmDataPerReadB alignment requirement is not meet"); - - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, - Number{}); - - // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); - - // blockwise copy - // input: format is [N, C, Hi, Wi] to [C, Hi, Wi, N] - constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{}; - - const auto blockwise_in_copy_reorder = BlockwiseTensorSliceReorderCopy_v3< - BlockSize, - Float, - decltype(in_n_c_h_w_global_desc), - decltype(in_c_h_w_n_block_desc), - Sequence, - InBlockReorderSrcSubLengths_NCHW, - InBlockReorderSrcClusterLengths_NCHW, - decltype(map_chwn2nchw), - InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, - InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N>{}; - - // blockwise wei copy - // format is [CPerBlock, KPerBlock] - const auto blockwise_wei_copy = - Blockwise2dTensorCopy3{}; - - // a series of blockwise batched GEMM - // C_matrix += transpose(A_matrix) * B_matrix - // A_matrix and B_matrix saved in LDS, C_matrix saved in register - // A_matrix[C,K] is a sub-matrix of wei_block[C,K] - // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] - // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N] - constexpr auto a_c_k_block_mtx_desc = 
make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto b_c_wn_block_mtx_desc = - make_ConstantMatrixDescriptor(Number{}, - Number{}, - Number{}); - - constexpr auto c_k_wn_thread_mtx_desc = - make_ConstantMatrixDescriptor(Number{}, - Number{}, - Number{}); - - const auto blockwise_batch_gemm = - BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2< - BlockSize, - decltype(a_c_k_block_mtx_desc), - decltype(b_c_wn_block_mtx_desc), - decltype(c_k_wn_thread_mtx_desc), - 0, - in_c_h_w_n_block_desc.GetStride(I1), - out_k_h_w_n_thread_desc.GetStride(I1), - HoPerBlock, - GemmMPerThreadSubC, - GemmNPerThreadSubC, - GemmMLevel0Cluster, - GemmNLevel0Cluster, - GemmMLevel1Cluster, - GemmNLevel1Cluster, - GemmKPerThreadLoop, - HoPerThread, - GemmDataPerReadA, - GemmDataPerReadB>{}; - - // choose GEMM implementation here - const auto run_blockwise_batch_gemm = [&](auto... Xs) { -#if 0 - return blockwise_batch_gemm.Run(Xs...); -#elif 0 - return blockwise_batch_gemm.Run_asm(Xs...); -#else - return blockwise_batch_gemm.Run_asm_v2(Xs...); -#endif - }; - - // LDS: be careful of alignment - constexpr index_t in_block_space = - in_c_h_w_n_block_desc.GetElementSpace(Number{}); - constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number{}); - - __shared__ Float p_in_block[in_block_space]; - __shared__ Float p_wei_block[wei_block_space]; - - // register - // C++ lambda doesn't capture array, use pointer instead - Float p_out_thread_data[out_k_h_w_n_thread_desc.GetElementSpace()]; - Float* const p_out_thread = p_out_thread_data; - -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(in_c_h_w_n_global_desc, "in_c_h_w_n_global_desc"); - print_ConstantTensorDescriptor(wei_c_y_x_k_global_desc, "wei_c_y_x_k_global_desc"); - - print_ConstantTensorDescriptor(in_c_h_w_n_block_desc, "in_c_h_w_n_block_desc"); - print_ConstantTensorDescriptor(wei_c_k_block_desc, "wei_c_k_block_desc"); - - 
printf("in_block_space %u, wei_block_space %u\n", in_block_space, wei_block_space); - } -#endif - - // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); - -#if 1 - const Float* p_in_global_block_offset = - p_in_global + - in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( - n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin); - - const Float* p_wei_global_block_offset = - p_wei_global + - wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin); - - for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, - p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), - p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) - { - for(index_t y = 0; y < Y; ++y) - { - for(index_t x = 0; x < X; ++x) - { -#if 1 - blockwise_in_copy_reorder.Run( - p_in_global_block_offset + - in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x), - p_in_block); - - blockwise_wei_copy.Run( - p_wei_global_block_offset + - wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0), - p_wei_block); -#else - Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()]; - Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - blockwise_in_copy_reorder.RunLoadRegisterClipboard( - p_in_global_block_offset + - in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x), - p_in_clipboard); - - blockwise_wei_copy.RunLoadRegisterClipboard( - p_wei_global_block_offset + - wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0), - p_wei_clipboard); - - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block); - - blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_clipboard, p_in_block); - -#endif - - __syncthreads(); - - run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread); - - __syncthreads(); - } - } - } -#else - for(index_t y = 0; y < Y; ++y) - { - for(index_t x 
= 0; x < X; ++x) - { - const Float* p_in_global_block_offset = - p_in_global + - in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( - n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x); - - const Float* p_wei_global_block_offset = - p_wei_global + - wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin); - - for(index_t c_block_data_begin = 0; c_block_data_begin < C; - c_block_data_begin += CPerBlock, - p_in_global_block_offset += - CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), - p_wei_global_block_offset += - CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) - { -#if 0 - blockwise_in_copy_reorder.Run(p_in_global_block_offset, - p_in_block); - - blockwise_wei_copy.Run(p_wei_global_block_offset, - p_wei_block); -#else - Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()]; - Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_clipboard); - - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block); - blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_clipboard, p_in_block); -#endif - - __syncthreads(); - - run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread); - - __syncthreads(); - } - } - } -#endif - - // output: register to global mem, - const auto c_thread_mtx_begin = - blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - - const index_t k_thread_data_begin = c_thread_mtx_begin.row; - const index_t ho_thread_data_begin = c_thread_mtx_begin.batch; - const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; - const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; - - static_if{}([&](auto f_dummy) { // f_dummy do nothing but - // perfect forwarding. 
- // Using this trick to - // make this lambda a generic lambda, so it won't be compiled until - // instantiated - static_assert( - (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), - "wrong!"); - - // output is a 10d tensor - constexpr index_t N2 = GemmNPerThreadSubC; - constexpr index_t N1 = NPerBlock / N2; - - constexpr index_t W2 = - (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC); - constexpr index_t W1 = WoPerBlock / W2; - - constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; - - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); - - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); - -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, - "out_k_h_w_n_thread_desc"); - print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc"); - - print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, - "out_k_h_w_n_global_desc"); - print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc"); - } -#endif - - threadwise_tensor_slice_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }).Else([&](auto f_dummy) { - static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && - GemmNPerThreadSubC % NPerThread == 0, - "wrong!"); - - // output is a 10d tensor - constexpr index_t N1 = NPerBlock; - - constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; - constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; - constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); - 
- constexpr index_t K2 = GemmMPerThreadSubC; - constexpr index_t K1 = KPerBlock / KPerThread; - - constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( - Sequence{}); - - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); - -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, - "out_k_h_w_n_thread_desc"); - print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc"); - - print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, - "out_k_h_w_n_global_desc"); - print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc"); - - for(index_t i = 0; i < 64; ++i) - { - printf("out %f, ", p_out_thread[i]); - } - } -#endif - - threadwise_tensor_slice_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }); - } -}; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp index 37e98faa77..4b13903459 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp @@ -1,13 +1,17 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_2d_tensor_op.hpp" -#include "blockwise_tensor_slice_op.hpp" -#include "threadwise_tensor_slice_op.hpp" -#include "threadwise_4d_tensor_op.hpp" +#include 
"blockwise_tensor_slice_copy.hpp" +#include "threadwise_tensor_slice_copy.hpp" +#include "threadwise_generic_tensor_op.hpp" #include "blockwise_batched_gemm.hpp" +namespace ck { + template {}); @@ -103,10 +107,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N, - WeiBlockCopyDataPerRead_K, - GemmDataPerReadA, - GemmDataPerReadB); + constexpr index_t max_align = math::lcm(InBlockReorderDataPerWrite_N, + WeiBlockCopyDataPerRead_K, + GemmDataPerReadA, + GemmDataPerReadB); constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, @@ -119,7 +123,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( @@ -230,7 +234,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw #endif // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); + threadwise_generic_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); #if 0 const Float* p_in_global_block_offset = @@ -436,8 +440,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw wo_block_data_begin + wo_thread_data_begin), make_zero_array(), out_10d_thread_desc.GetLengths().ReorderGivenNew2Old(map_out_global2thread), - arithmetic_sequence_gen<0, 10, 1>::SeqType{}); + arithmetic_sequence_gen<0, 10, 1>::SeqType{}, + Number<1>{}); #endif }); } }; + +} // namespace ck +#endif diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp similarity index 94% rename from 
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp rename to src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp index 3bd1f8cac7..885252cd38 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp @@ -1,13 +1,17 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_2d_tensor_op.hpp" -#include "blockwise_tensor_slice_op.hpp" -#include "threadwise_tensor_slice_op.hpp" -#include "threadwise_4d_tensor_op.hpp" +#include "blockwise_tensor_slice_copy.hpp" +#include "threadwise_tensor_slice_copy.hpp" +#include "threadwise_generic_tensor_op.hpp" #include "blockwise_batched_gemm.hpp" +namespace ck { + template -struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw +struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer { __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, @@ -81,10 +85,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, "wrong! 
cannot evenly divide work for workgroup "); - constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock); - constexpr index_t KBlockWork = mod_conv::integer_divide_ceil(K, KPerBlock); - constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); - constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); + constexpr index_t NBlockWork = math::integer_divide_ceil(N, NPerBlock); + constexpr index_t KBlockWork = math::integer_divide_ceil(K, KPerBlock); + constexpr index_t HBlockWork = math::integer_divide_ceil(Ho, HoPerBlock); + constexpr index_t WBlockWork = math::integer_divide_ceil(Wo, WoPerBlock); constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed( Sequence{}); @@ -105,10 +109,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N, - WeiBlockCopyDataPerRead_K, - GemmDataPerReadA, - GemmDataPerReadB); + constexpr index_t max_align = math::lcm(InBlockReorderDataPerWrite_N, + WeiBlockCopyDataPerRead_K, + GemmDataPerReadA, + GemmDataPerReadB); constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, @@ -121,7 +125,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( @@ -233,7 +237,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw #endif // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); + threadwise_generic_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); for(index_t y = 0; y < Y; ++y) { @@ -487,8 +491,12 @@ struct 
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw wo_block_data_begin + wo_thread_data_begin), make_zero_array(), out_10d_thread_desc.GetLengths().ReorderGivenNew2Old(map_out_global2thread), - arithmetic_sequence_gen<0, 10, 1>::SeqType{}); + arithmetic_sequence_gen<0, 10, 1>::SeqType{}, + Number<1>{}); #endif }); } }; + +} // namespace +#endif diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp index dc3182949e..26c6b75fa9 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp @@ -1,4 +1,6 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" @@ -6,6 +8,8 @@ #include "blockwise_2d_tensor_op.hpp" #include "blockwise_gemm.hpp" +namespace ck { + // define B = flatten(N, Hi, Wi) template {}); @@ -275,3 +279,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn } } }; + +} // namespace ck +#endif diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp index a9d8a7f7cf..3241aabf62 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -1,12 +1,16 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_4d_tensor_op.hpp" #include 
"blockwise_2d_tensor_op.hpp" -#include "threadwise_tensor_slice_op.hpp" +#include "threadwise_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" +namespace ck { + // define B = flatten(N, Hi, Wi) template {}); @@ -404,3 +408,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer } } }; + +} // namespace ck +#endif diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp index 8e9a46753d..ab44486280 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp @@ -1,11 +1,15 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_op.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" +namespace ck { + // define B = merge(N0, Ho, Wo) template {}, - Number{}); + Number{}); // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor @@ -218,10 +222,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw }; // LDS allocation for input and weight: be careful of alignment - constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2, - WeiBlockCopyDataPerAccess_K, - GemmDataPerReadA, - GemmDataPerReadB); + constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2, + WeiBlockCopyDataPerAccess_K, + GemmDataPerReadA, + GemmDataPerReadB); constexpr index_t in_block_space = in_c_n1_b_n2_block_mem_desc.GetElementSpace(Number{}); @@ -368,3 +372,6 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw } } }; + +} // namespace ck +#endif diff --git 
a/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp similarity index 96% rename from src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp rename to src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp index 92f4adeee1..2222e53abf 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp @@ -1,11 +1,15 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_op.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" +namespace ck { + // define B = merge(N0, Ho, Wo) template -struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw +struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer { __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, @@ -143,7 +147,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw // be careful of LDS alignment constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor @@ -215,10 +219,10 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw }; // LDS allocation for input and weight: be careful of alignment - constexpr index_t max_align = 
mod_conv::lcm(InBlockCopyDstDataPerWrite_N2, - WeiBlockCopyDataPerAccess_K, - GemmDataPerReadA, - GemmDataPerReadB); + constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2, + WeiBlockCopyDataPerAccess_K, + GemmDataPerReadA, + GemmDataPerReadB); constexpr index_t in_block_space = in_c_n1_b_n2_block_mem_desc.GetElementSpace(Number{}); @@ -395,3 +399,6 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw } } }; + +} // namesspace ck +#endif diff --git a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp index cba229c8db..31067c8591 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp @@ -1,11 +1,15 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_op.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" -#include "threadwise_generic_tensor_slice_op.hpp" +#include "threadwise_generic_tensor_slice_copy.hpp" + +namespace ck { // define B = merge(N0, Ho, Wo) template {}, - Number{}); + Number{}); // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor @@ -248,10 +252,10 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw }; // LDS allocation for input and weight: be careful of alignment - constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2, - WeiBlockCopyDstDataPerWrite_K, - GemmDataPerReadA, - GemmDataPerReadB); + constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2, + WeiBlockCopyDstDataPerWrite_K, + GemmDataPerReadA, + 
GemmDataPerReadB); constexpr index_t in_block_space = in_e_n1_b_n2_block_desc.GetElementSpace(Number{}); @@ -345,3 +349,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw } } }; + +} // namespace ck +#endif diff --git a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp similarity index 96% rename from src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp rename to src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp index db97aae039..9df6700d6d 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -1,11 +1,15 @@ -#pragma once +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER + #include "common.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_op.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" -#include "threadwise_generic_tensor_slice_op.hpp" +#include "threadwise_generic_tensor_slice_copy.hpp" + +namespace ck { // define B = merge(N0, Ho, Wo) template -struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw +struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer { __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, @@ -165,7 +169,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw // be careful of LDS alignment constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // 
operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor @@ -237,10 +241,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw }; // LDS allocation for input and weight: be careful of alignment - constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2, - WeiBlockCopyDstDataPerWrite_K, - GemmDataPerReadA, - GemmDataPerReadB); + constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2, + WeiBlockCopyDstDataPerWrite_K, + GemmDataPerReadA, + GemmDataPerReadB); constexpr index_t in_block_space = in_e_n1_b_n2_block_desc.GetElementSpace(Number{}); @@ -410,3 +414,6 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw } } }; + +} // namespace ck +#endif diff --git a/src/include/gridwise_convolution_kernel_wrapper.hpp b/src/include/gridwise_convolution_kernel_wrapper.hpp new file mode 100644 index 0000000000..a7caeed5aa --- /dev/null +++ b/src/include/gridwise_convolution_kernel_wrapper.hpp @@ -0,0 +1,16 @@ +#ifndef CK_GRIDWISE_CONVOLUTION_KERNEL_WRAPPER +#define CK_GRIDWISE_CONVOLUTION_KERNEL_WRAPPER + +namespace ck { + +template +__global__ void run_gridwise_convolution_kernel(const T* const __restrict__ p_in_global, + const T* const __restrict__ p_wei_global, + T* const __restrict__ p_out_global) +{ + GridwiseConvolution{}.Run(p_in_global, p_wei_global, p_out_global); +} + +} // namespace ck + +#endif diff --git a/src/include/gridwise_convolution_wrapper.hpp b/src/include/gridwise_convolution_wrapper.hpp deleted file mode 100644 index e4c535ada5..0000000000 --- a/src/include/gridwise_convolution_wrapper.hpp +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -template -__global__ void run_gridwise_convolution(const T* const __restrict__ p_in_global, - const T* const __restrict__ p_wei_global, - T* const __restrict__ p_out_global) -{ - GridwiseConvolution{}.Run(p_in_global, p_wei_global, p_out_global); -} diff --git 
a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index 80f568c3d6..f2552260bc 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -7,6 +7,8 @@ #include "threadwise_4d_tensor_op.hpp" #include "threadwise_direct_convolution.hpp" +namespace ck { + template struct integral_constant @@ -16,3 +19,6 @@ __host__ __device__ constexpr auto operator+(integral_constant, integral_c template using Number = integral_constant; + +} // namespace ck +#endif diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp index e61ca1b432..ff537e44fe 100644 --- a/src/include/tensor.hpp +++ b/src/include/tensor.hpp @@ -1,4 +1,6 @@ -#pragma once +#ifndef CK_TENSOR_HPP +#define CK_TENSOR_HPP + #include #include #include @@ -266,3 +268,5 @@ struct Tensor TensorDescriptor mDesc; std::vector mData; }; + +#endif diff --git a/src/include/threadwise_4d_tensor_op.hpp b/src/include/threadwise_4d_tensor_op.hpp index ab209f4b13..9f8f0d12d6 100644 --- a/src/include/threadwise_4d_tensor_op.hpp +++ b/src/include/threadwise_4d_tensor_op.hpp @@ -1,6 +1,10 @@ -#pragma once +#ifndef CK_THREADWISE_4D_TENSOR_OP_HPP +#define CK_THREADWISE_4D_TENSOR_OP_HPP + #include "ConstantTensorDescriptor.hpp" +namespace ck { + template __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift) { @@ -50,3 +54,6 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi } } } + +} // namespace ck +#endif diff --git a/src/include/threadwise_direct_convolution.hpp b/src/include/threadwise_direct_convolution.hpp index a518c2d182..5d4d6376bb 100644 --- a/src/include/threadwise_direct_convolution.hpp +++ b/src/include/threadwise_direct_convolution.hpp @@ -1,6 +1,10 @@ -#pragma once +#ifndef CK_THREADWISE_DIRECT_CONVOLUTION_HPP +#define 
CK_THREADWISE_DIRECT_CONVOLUTION_HPP + #include "ConstantTensorDescriptor.hpp" -#include "threadwise_tensor_slice_op.hpp" +#include "threadwise_tensor_slice_copy.hpp" + +namespace ck { // optimized for scenario if p_in, p_wei, p_out are in register template @@ -218,3 +222,6 @@ __device__ void threadwise_direct_convolution_3(InDesc, } #endif } + +} // namespace ck +#endif diff --git a/src/include/threadwise_gemm.hpp b/src/include/threadwise_gemm.hpp index d28e9e73ea..d79e2e9f9c 100644 --- a/src/include/threadwise_gemm.hpp +++ b/src/include/threadwise_gemm.hpp @@ -1,7 +1,11 @@ -#pragma once +#ifndef CK_THREADWISE_GEMM_HPP +#define CK_THREADWISE_GEMM_HPP + #include "common.hpp" #include "ConstantMatrixDescriptor.hpp" +namespace ck { + template __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread) { @@ -114,3 +118,6 @@ __device__ void threadwise_gemm(MatrixA, assert(false); } } + +} // namespace ck +#endif diff --git a/src/include/threadwise_generic_tensor_op.hpp b/src/include/threadwise_generic_tensor_op.hpp new file mode 100644 index 0000000000..e402255865 --- /dev/null +++ b/src/include/threadwise_generic_tensor_op.hpp @@ -0,0 +1,19 @@ +#ifndef CK_THREADWISE_GENERIC_TENSOR_OP_HPP +#define CK_THREADWISE_GENERIC_TENSOR_OP_HPP + +#include "ConstantTensorDescriptor.hpp" +#include "ConstantMergedTensorDescriptor.hpp" + +namespace ck { +template +__device__ void threadwise_generic_tensor_set_zero(TDesc, Float* __restrict__ p) +{ + static_ford{}([&](auto multi_id) { + constexpr index_t offset = TDesc::GetOffsetFromMultiIndex(multi_id); + + p[offset] = static_cast(0); + }); +} + +} // namespace ck +#endif diff --git a/src/include/threadwise_generic_tensor_slice_op.hpp b/src/include/threadwise_generic_tensor_slice_copy.hpp similarity index 96% rename from src/include/threadwise_generic_tensor_slice_op.hpp rename to src/include/threadwise_generic_tensor_slice_copy.hpp index 7ffed89f8e..9e7f90fe60 100644 --- 
a/src/include/threadwise_generic_tensor_slice_op.hpp +++ b/src/include/threadwise_generic_tensor_slice_copy.hpp @@ -1,7 +1,11 @@ -#pragma once +#ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP +#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP + #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" +namespace ck { + template __device__ void threadwise_tensor_slice_copy(SrcDesc, @@ -192,3 +196,6 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc, }); }); } + +} // namespace ck +#endif diff --git a/src/include/base.hpp b/src/include/utility.hpp similarity index 94% rename from src/include/base.hpp rename to src/include/utility.hpp index dd6fc19b1f..c7b4ba337d 100644 --- a/src/include/base.hpp +++ b/src/include/utility.hpp @@ -1,4 +1,7 @@ -#pragma once +#ifndef CK_BASE_HPP +#define CK_BASE_HPP + +namespace ck { __device__ index_t get_thread_local_1d_id() { return threadIdx.x; } @@ -22,7 +25,7 @@ __host__ __device__ constexpr bool is_same_type(X, Y) return is_same::value; } -namespace mod_conv { // namespace mod_conv +namespace math { // namespace math template struct scales { @@ -110,4 +113,7 @@ __host__ __device__ constexpr T lcm(T x, Ts... xs) return max(x, xs...); } -} // namespace mod_conv +} // namespace math +} // namspace ck + +#endif diff --git a/src/include/vector_type.hpp b/src/include/vector_type.hpp index 3f5a3a10a8..72c73068e1 100644 --- a/src/include/vector_type.hpp +++ b/src/include/vector_type.hpp @@ -1,7 +1,11 @@ -#pragma once -#include "config.h" +#ifndef CK_VECTOR_TYPE_HPP +#define CK_VECTOR_TYPE_HPP + +#include "config.hpp" #include "integral_constant.hpp" +namespace ck { + template struct vector_type { @@ -184,3 +188,7 @@ struct vector_type using MemoryType = int64_t; }; #endif + +} // namespace ck + +#endif