mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-12 01:10:17 +00:00)
rename files, added header guard, added namespace
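
The whole commit applies one mechanical pattern per header: add a classic include guard alongside #pragma once, wrap library headers in namespace ck (or add using namespace ck; in driver headers), rename the old mod_conv math namespace to math, and move the _lds_double_buffer infix to a suffix in file and class names. A minimal sketch of the header pattern (CK_EXAMPLE_HPP and Example are illustrative names, not from the diff):

    #pragma once
    #ifndef CK_EXAMPLE_HPP
    #define CK_EXAMPLE_HPP

    #include "common.hpp"

    namespace ck {

    struct Example
    {
    };

    } // namespace ck
    #endif
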
@@ -1,9 +1,11 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"

+using namespace ck;
+
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in,
@@ -79,7 +81,7 @@ void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
WoPerThread,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead>;
-float time = launch_kernel(run_gridwise_convolution<gridwise_conv, T>,
+float time = launch_kernel(run_gridwise_convolution_kernel<gridwise_conv, T>,
dim3(GridSize),
dim3(BlockSize),
0,

@@ -1,11 +1,13 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"

+using namespace ck;
+
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
@@ -478,7 +480,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
#elif 0
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
#elif 1
-GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
+GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
#endif
<GridSize,
BlockSize,
@@ -509,7 +511,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_N>{};

-float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
@@ -1,282 +0,0 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};

constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};

constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);

constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);

constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);

// reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
};

make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
std::thread::hardware_concurrency());
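
The lambda above permutes the host-side weight tensor from KCYX to CYXK before upload, so the kernel can read K contiguously. A standalone sketch of the same permutation on packed row-major buffers (plain loops instead of make_ParallelTensorFunctor; reorder_kcyx_to_cyxk is a hypothetical helper, not part of the source):

    // dst is packed CYXK, src is packed KCYX; both hold K*C*Y*X elements.
    template <class T>
    void reorder_kcyx_to_cyxk(const T* src, T* dst, int K, int C, int Y, int X)
    {
        for(int k = 0; k < K; ++k)
            for(int c = 0; c < C; ++c)
                for(int y = 0; y < Y; ++y)
                    for(int x = 0; x < X; ++x)
                        dst[((c * Y + y) * X + x) * K + k] =
                            src[((k * C + c) * Y + y) * X + x];
    }
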
// output
auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");

Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));

std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());

in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_khwn_device_buf.ToDevice(out_khwn.mData.data());

#if 1
// for 3x3, 34x34, v1r3, Pascal
constexpr index_t BlockSize = 128;

constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 16;

constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 4;

constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;

using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 2;

using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;

constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 1
// for 3x3, 34x34, v1r3, Vega 20
constexpr index_t BlockSize = 256;

constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 16;

constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 4;

constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;

using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 2, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 2;

using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;

constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 3x3, 28x28, v1r2, Pascal
constexpr index_t BlockSize = 128;

constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;

constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;

constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;

using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 2>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 2;
constexpr index_t InBlockReorderDataPerWrite_N = 4;

using WeiBlockCopyClusterLengths = Sequence<4, 1, 32>;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;

constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 3x3, 28x28, v1r3, Pascal, bad
constexpr index_t BlockSize = 128;

constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;

constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;

constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;

using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 1;

using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;

constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#endif
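
One consistency check implied by these tuning blocks: the per-block output tile must equal BlockSize times the per-thread tile. For the Pascal 34x34 config, (2 * 128 * 2 * 16) / (2 * 8 * 1 * 4) = 8192 / 64 = 128 = BlockSize; for the Vega 20 config, (2 * 128 * 4 * 16) / 64 = 256 = BlockSize. A hypothetical compile-time guard (not in the source) would be:

    static_assert(NPerBlock * KPerBlock * HoPerBlock * WoPerBlock ==
                      BlockSize * (NPerThread * KPerThread * HoPerThread * WoPerThread),
                  "per-block tile must equal BlockSize x per-thread tile");
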

constexpr index_t GridSize =
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
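
Each factor is a ceiling division, so the grid covers the whole output even when a dimension is not a multiple of its block tile. A worked example under stated assumptions (the driver's 3x3, 34x34 case gives N = 64; K and the output size are not visible in this diff, so assume K = 128 and a 32x32 output): GridSize = (64/2) * (128/128) * (32/2) * (32/16) = 32 * 1 * 16 * 2 = 1024 blocks.
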

printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
#elif 0
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
#elif 1
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
#endif
<GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_cyxk_desc),
decltype(out_khwn_desc),
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
HoPerThread,
WoPerThread,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockReorderSrcSubLengths_NCHW,
InBlockReorderSrcClusterLengths_NCHW,
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
InBlockReorderDataPerRead_W,
InBlockReorderDataPerWrite_N,
WeiBlockCopyClusterLengths,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_N>{};

float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));

printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
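
The units work out because calculate_convolution_flops returns total Flop and time is in milliseconds: Flop / 10^9 is GFlop, and GFlop per millisecond equals TFlop/s (1 GFlop/ms = 10^12 Flop/s), matching the TFlop/s label in the format string.
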

usleep(std::min(time * 1000, float(10000)));
}

out_khwn_device_buf.FromDevice(out_khwn.mData.data());

// reorder output
auto f_reorder_khwn2nkhw = [&](auto k, auto ho, auto wo, auto n) {
out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n);
};

make_ParallelTensorFunctor(f_reorder_khwn2nkhw, K, Ho, Wo, N)(
std::thread::hardware_concurrency());
}

@@ -1,9 +1,11 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"

+using namespace ck;
+
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
@@ -313,10 +315,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 1
#if 0
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
#else
-GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
+GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
#endif
<GridSize,
BlockSize,
@@ -351,7 +353,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_W>{};

-float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,

@@ -1,10 +1,12 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"

+using namespace ck;
+
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
const Tensor<T>& in_nchw,
@@ -303,7 +305,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
WeiBlockCopyDataPerRead,
OutThreadCopyDataPerWrite>{};

-float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,

@@ -1,9 +1,11 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"

+using namespace ck;
+
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
@@ -102,7 +104,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
#if 0
GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
#else
-GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
+GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
#endif
<GridSize,
BlockSize,
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
WeiBlockCopyDataPerAccess_K>{};

#if 1
-float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,

@@ -1,9 +1,11 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"

+using namespace ck;
+
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
@@ -96,7 +98,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
#if 0
GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
#else
-GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
+GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
#endif
<GridSize,
BlockSize,
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};

-float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,

@@ -3,6 +3,8 @@
#include "device.hpp"
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"

+using namespace ck;
+
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
const Tensor<TInWei>& in_nchw,

@@ -3,6 +3,8 @@
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"

+using namespace ck;
+
template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
const Tensor<T>& in_nchw,

@@ -3,19 +3,19 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
-#include "config.h"
+#include "config.hpp"
#include "tensor.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "conv_common.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"

using namespace ck;

struct GeneratorTensor_1
{
template <class... Is>
@@ -419,7 +419,7 @@ int main(int argc, char* argv[])

constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
-#elif 0
+#elif 1
// 3x3, 34x34
constexpr index_t N = 64;
constexpr index_t C = 256;
@@ -633,15 +633,9 @@ int main(int argc, char* argv[])

#if 1
#if 0
device_direct_convolution_1
#elif 0
device_convolution_direct_v2_nchw_kcyx_nkhw
#elif 0
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
#elif 0
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
#elif 0

@@ -1,4 +1,4 @@
-configure_file("${PROJECT_SOURCE_DIR}/src/include/config.h.in" "${PROJECT_BINARY_DIR}/src/include/config.h")
+configure_file("${PROJECT_SOURCE_DIR}/src/include/config.hpp.in" "${PROJECT_BINARY_DIR}/src/include/config.hpp")

set(TENSOR_SOURCE
tensor.cpp;

@@ -1,4 +1,4 @@
-#include "config.h"
+#include "config.hpp"
#include "device.hpp"

DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)

@@ -1,7 +1,11 @@
#pragma once
+#ifndef CK_ARRAY_HPP
+#define CK_ARRAY_HPP
+
#include "Sequence.hpp"
#include "functional2.hpp"

+namespace ck {
+
template <class TData, index_t NSize>
struct Array
{
@@ -96,7 +100,7 @@ __host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData

static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");

-return Array<TData, NSize>{old_array.mSize[IRs]...};
+return Array<TData, NSize>{old_array[IRs]...};
}
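
The functional change in this hunk is the indexing fix: old_array.mSize[IRs]... read the raw storage member, while old_array[IRs]... goes through Array's operator[]. What the function computes, as a worked example (values are illustrative): given old_array = {10, 20, 30} and the map Sequence<2, 0, 1>, new position i takes old element IRs[i], so the result is {30, 10, 20}.
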

@@ -180,7 +184,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
{
Array<TData, NSize> result;

-auto f = mod_conv::plus<index_t>{};
+auto f = math::plus<index_t>{};

static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -195,7 +199,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData,
{
Array<TData, NSize> result;

-auto f = mod_conv::minus<index_t>{};
+auto f = math::minus<index_t>{};

static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -212,7 +216,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is.

Array<TData, NSize> result;

-auto f = mod_conv::plus<index_t>{};
+auto f = math::plus<index_t>{};

static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -229,7 +233,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is.

Array<TData, NSize> result;

-auto f = mod_conv::minus<index_t>{};
+auto f = math::minus<index_t>{};

static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -246,7 +250,7 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.

Array<TData, NSize> result;

-auto f = mod_conv::multiplies<index_t>{};
+auto f = math::multiplies<index_t>{};

static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -263,7 +267,7 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSi

Array<TData, NSize> result;

-auto f = mod_conv::minus<index_t>{};
+auto f = math::minus<index_t>{};

static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -368,3 +372,6 @@ __host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
a[9]);
});
}
+
+} // namespace ck
+#endif

@@ -1,6 +1,10 @@
#pragma once
+#ifndef CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
+#define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
+
#include "common.hpp"

+namespace ck {
+
template <index_t NRow_, index_t NCol_, index_t RowStride_>
struct ConstantMatrixDescriptor
{
@@ -57,3 +61,7 @@ __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)

printf("%s NRow %u NCol %u RowStride %u\n", s, desc.NRow(), desc.NCol(), desc.RowStride());
}
+
+} // namespace ck
+
+#endif

@@ -1,7 +1,11 @@
#pragma once
+#ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
+#define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
+
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"

+namespace ck {
+
// OriginalTensorDesc : ConstantTensorDescriptor<...>
// it's the tensor whose dimensions are to be merged
// OriginalDimMergeSeqs : Sequence<...>...
@@ -184,3 +188,6 @@ __host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDe
{
print_ConstantTensorDescriptor(s, TDesc::GetOriginalTensorDescriptor());
}
+
+} // namespace ck
+#endif

@@ -1,11 +1,15 @@
#pragma once
+#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
+#define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
+
#include "common.hpp"

+namespace ck {
+
template <class Lengths>
__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
{
return reverse_inclusive_scan_sequence(
-Lengths{}.PopFront(), mod_conv::multiplies<index_t>{}, Number<1>{})
+Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
.PushBack(Number<1>{});
}
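
The reverse inclusive scan with multiplication is exactly the packed row-major stride rule. Worked example: for Lengths = Sequence<2, 3, 4, 5>, PopFront leaves {3, 4, 5}; scanning from the right with multiplication (seed 1) gives {60, 20, 5}; PushBack(1) appends the unit stride, so the strides are {60, 20, 5, 1} — each stride is the product of all lengths to its right.
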

@@ -13,7 +17,7 @@ template <class Lengths, index_t Align>
__host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number<Align>)
{
constexpr index_t L_back_align =
-Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
+Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);

return calculate_tensor_strides_packed(
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
@@ -100,7 +104,7 @@ struct ConstantTensorDescriptor

__host__ __device__ static constexpr index_t GetElementSize()
{
-return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
+return accumulate_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{});
}

template <class Align = Number<1>>
@@ -109,7 +113,7 @@ struct ConstantTensorDescriptor
// This is WRONG! align should be applied to the last memory rank, not the last tensor
// dimension
constexpr index_t element_space_unaligned = accumulate_on_sequence(
-(GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
+(GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});

return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
}
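
The return expression is the standard round-up-to-a-multiple idiom: align * ceil(x / align). For example, with element_space_unaligned = 37 and align = 8, it yields 8 * ((37 + 7) / 8) = 8 * 5 = 40.
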

@@ -161,8 +165,7 @@ struct ConstantTensorDescriptor

constexpr auto multi_id = Sequence<Is...>{};

-return accumulate_on_sequence(
-multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
+return accumulate_on_sequence(multi_id * GetStrides(), math::plus<index_t>{}, Number<0>{});
}

// emulate constexpr lambda
@@ -323,7 +326,7 @@ struct ConstantTensorDescriptor
constexpr auto fold_intervals = Sequence<FoldIntervals...>{};

constexpr index_t fold_intervals_product =
-accumulate_on_sequence(fold_intervals, mod_conv::multiplies<index_t>{}, Number<1>{});
+accumulate_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});

constexpr auto unfold_length = GetLength(Number<IDim>{});
constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -341,7 +344,7 @@ struct ConstantTensorDescriptor
constexpr auto fold_strides =
Number<unfold_stride>{} *
reverse_inclusive_scan_sequence(
-fold_intervals.PushBack(Number<1>{}), mod_conv::multiplies<index_t>{}, Number<1>{});
+fold_intervals.PushBack(Number<1>{}), math::multiplies<index_t>{}, Number<1>{});

// left and right
constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{};
@@ -376,7 +379,7 @@ struct ConstantTensorDescriptor

// unfolded length, stride
constexpr index_t unfold_length = accumulate_on_sequence(
-GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});
+GetLengths().Extract(middle), math::multiplies<index_t>{}, Number<1>{});

constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});

@@ -511,3 +514,6 @@ print_ConstantTensorDescriptor(const char* s,
Strides...);
});
}
+
+} // namespace ck
+#endif

@@ -1,7 +1,11 @@
#pragma once
+#ifndef CK_SEQUENCE_HPP
+#define CK_SEQUENCE_HPP
+
#include "integral_constant.hpp"
#include "functional.hpp"

+namespace ck {
+
template <class Seq>
struct is_valid_sequence_map;

@@ -547,3 +551,6 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
static_if<nsize == 10>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
}
+
+} // namespace ck
+#endif

@@ -1,4 +1,6 @@
#pragma once
+#ifndef CK_AMD_INLINE_ASM_HPP
+#define CK_AMD_INLINE_ASM_HPP

#include "common.hpp"

#define NO_VM_WAIT 0
@@ -7,6 +9,8 @@
#define NO_DS_WRITE 0
#define NO_GLB_READ 0

+namespace ck {
+
// cast a pointer of LDS to its address
extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];

@@ -759,3 +763,6 @@ ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t off
}
#endif
}
+
+} // namespace ck
+#endif

@@ -1,7 +1,11 @@
#pragma once
+#ifndef CK_BLOCKWISE_2D_TENSOR_OP_HPP
+#define CK_BLOCKWISE_2D_TENSOR_OP_HPP
+
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"

+namespace ck {
+
template <index_t BlockSize, class Float, class DstDesc, class F>
__device__ void
blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
@@ -192,7 +196,7 @@ struct Blockwise2dTensorCopy1
// but we need to make sure dst stride0 is big enough,
// so that the out-of-bound write won't contaminate next line in dst
constexpr index_t L1 = CopyLengths{}.Get(I1);
-constexpr index_t read_per_d1 = mod_conv::integer_divide_ceil(L1, DataPerRead);
+constexpr index_t read_per_d1 = math::integer_divide_ceil(L1, DataPerRead);

static_assert(read_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0),
"wrong! out-of-bound write will contaminate next line!\n");

@@ -209,7 +213,7 @@ struct Blockwise2dTensorCopy1
constexpr index_t L0 = CopyLengths{}.Get(I0);
constexpr index_t L1 = CopyLengths{}.Get(I1);

-constexpr index_t read_per_d1 = mod_conv::integer_divide_ceil(L1, DataPerRead);
+constexpr index_t read_per_d1 = math::integer_divide_ceil(L1, DataPerRead);

constexpr auto ref_desc = make_ConstantTensorDescriptor(Sequence<L0, read_per_d1>{});

@@ -676,7 +680,7 @@ struct Blockwise2dTensorCopy3
}
}

-#if USE_AMD_INLINE_ASM
+#if CK_USE_AMD_INLINE_ASM
__device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
Float* p_clipboard) const
{
@@ -796,3 +800,7 @@ struct Blockwise2dTensorCopy3
}
#endif
};
+
+} // namespace ck
+
+#endif

@@ -1,7 +1,11 @@
#pragma once
+#ifndef CK_BLOCKWISE_3D_TENSOR_OP_HPP
+#define CK_BLOCKWISE_3D_TENSOR_OP_HPP
+
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"

+namespace ck {
+
template <index_t BlockSize,
class Float,
class SrcDesc,
@@ -33,7 +37,7 @@ struct Blockwise3dTensorCopy1
// but we need to make sure dst stride2 is big enough,
// so that the out-of-bound write won't contaminate next line in dst
constexpr index_t L2 = CopyLengths{}.Get(I2);
-constexpr index_t read_per_d2 = mod_conv::integer_divide_ceil(L2, DataPerRead);
+constexpr index_t read_per_d2 = math::integer_divide_ceil(L2, DataPerRead);

static_assert(read_per_d2 * DataPerRead <= DstDesc{}.GetStride(I1),
"wrong! out-of-bound write will contaminate next line!\n");
@@ -52,7 +56,7 @@ struct Blockwise3dTensorCopy1
constexpr index_t L1 = CopyLengths{}.Get(I1);
constexpr index_t L2 = CopyLengths{}.Get(I2);

-constexpr index_t read_per_d2 = mod_conv::integer_divide_ceil(L2, DataPerRead);
+constexpr index_t read_per_d2 = math::integer_divide_ceil(L2, DataPerRead);

constexpr auto ref_desc = make_ConstantTensorDescriptor(Sequence<L0, L1, read_per_d2>{});

@@ -146,7 +150,7 @@ struct Blockwise3dTensorCopy3
// we allow out-of-bound read from src in D2 dimension,
// but we need to make sure dst stride is big enough,
// so that the out-of-bound write won't contaminate next line in dst
-constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);

static_assert(nloop_d2 * thread_per_d2 * DataPerRead <= DstDesc{}.GetStride(I1),
"wrong! out-of-bound write will contaminate next line!\n");
@@ -158,7 +162,7 @@ struct Blockwise3dTensorCopy3
"wrong! BlockSize is not big enough for ThreadPerDims!");

constexpr index_t num_active_thread =
-accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
+accumulate_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});

if(BlockSize > num_active_thread)
{
@@ -205,7 +209,7 @@ struct Blockwise3dTensorCopy3

constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
-constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);

#pragma unroll
for(index_t iloop_d0 = 0; iloop_d0 < nloop_d0; ++iloop_d0)
@@ -251,7 +255,7 @@ struct Blockwise3dTensorCopy3

constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
-constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);

return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2;
}
@@ -283,7 +287,7 @@ struct Blockwise3dTensorCopy3

constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
-constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);

constexpr auto clipboard_desc =
make_ConstantTensorDescriptor(Sequence<nloop_d0, nloop_d1, nloop_d2 * DataPerRead>{});
@@ -339,7 +343,7 @@ struct Blockwise3dTensorCopy3

constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
-constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);

constexpr auto clipboard_desc =
make_ConstantTensorDescriptor(Sequence<nloop_d0, nloop_d1, nloop_d2 * DataPerRead>{});
@@ -368,3 +372,7 @@ struct Blockwise3dTensorCopy3
}
}
};
+
+} // namespace ck
+
+#endif

@@ -1,6 +1,10 @@
#pragma once
+#ifndef CK_BLOCKWISE_4D_TENSOR_OP_HPP
+#define CK_BLOCKWISE_4D_TENSOR_OP_HPP
+
#include "ConstantTensorDescriptor.hpp"
-#include "threadwise_tensor_slice_op.hpp"
+#include "threadwise_tensor_slice_copy.hpp"

+namespace ck {
+
template <index_t BlockSize, class Float, class DstDesc, class F>
__device__ void
@@ -235,7 +239,7 @@ struct Blockwise4dTensorCopy1
// but we need to make sure dst stride2 is big enough,
// so that the out-of-bound write won't contaminate next line in dst
constexpr index_t L3 = CopyLengths{}.Get(I3);
-constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);
+constexpr index_t read_per_d3 = math::integer_divide_ceil(L3, DataPerRead);

static_assert(read_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2),
"wrong! out-of-bound write will contaminate next line!\n");
@@ -256,7 +260,7 @@ struct Blockwise4dTensorCopy1
constexpr index_t L2 = CopyLengths{}.Get(I2);
constexpr index_t L3 = CopyLengths{}.Get(I3);

-constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);
+constexpr index_t read_per_d3 = math::integer_divide_ceil(L3, DataPerRead);

constexpr auto ref_desc =
make_ConstantTensorDescriptor_packed(Sequence<L0, L1, L2, read_per_d3>{});
@@ -488,7 +492,7 @@ struct Blockwise4dTensorCopy3
// we allow out-of-bound read from src in D3 dimension,
// but we need to make sure dst stride is big enough,
// so that the out-of-bound write won't contaminate next line in dst
-constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
+constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

static_assert(nloop_d3 * thread_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2),
"wrong! out-of-bound write will contaminate next line!\n");
@@ -500,7 +504,7 @@ struct Blockwise4dTensorCopy3
"wrong! BlockSize is not big enough for ThreadPerDims!");

constexpr index_t num_active_thread =
-accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
+accumulate_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});

if(BlockSize > num_active_thread)
{
@@ -556,7 +560,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
constexpr index_t nloop_d2 = L2 / thread_per_d2;
-constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
+constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

#pragma unroll
for(index_t iloop_d0 = 0; iloop_d0 < nloop_d0; ++iloop_d0)
@@ -613,7 +617,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
constexpr index_t nloop_d2 = L2 / thread_per_d2;
-constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
+constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2 * nloop_d3;
}
@@ -650,7 +654,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
constexpr index_t nloop_d2 = L2 / thread_per_d2;
-constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
+constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed(
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
@@ -717,7 +721,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
constexpr index_t nloop_d2 = L2 / thread_per_d2;
-constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
+constexpr index_t nloop_d3 = math::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed(
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
@@ -768,3 +772,7 @@ struct Blockwise4dTensorCopyReorder1
SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
}
};
+
+} // namespace
+
+#endif

@@ -1,6 +1,10 @@
#pragma once
+#ifndef CK_BLOCKWISE_BATCHED_GEMM_HPP
+#define CK_BLOCKWISE_BATCHED_GEMM_HPP
+
#include "threadwise_gemm.hpp"

+namespace ck {
+
template <index_t BlockSize,
class BlockMatrixA,
class BlockMatrixB,
@@ -287,7 +291,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
}
}

-#if USE_AMD_INLINE_ASM
+#if CK_USE_AMD_INLINE_ASM
template <class FloatA, class FloatB, class FloatC>
__device__ void Run_asm(const FloatA* __restrict__ p_a_block,
const FloatB* __restrict__ p_b_block,
@@ -518,3 +522,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
}
}
};
+
+} // namespace
+#endif

@@ -1,7 +1,11 @@
#pragma once
+#ifndef CK_BLOCKWISE_GEMM_HPP
+#define CK_BLOCKWISE_GEMM_HPP
+
#include "common.hpp"
#include "threadwise_gemm.hpp"

+namespace ck {
+
// if the following numbers are powers of 2, index calculation is greatly reduced:
// MPerThreadSubC, NPerThreadSubC, MLevel0Cluster, NLevel0Cluster, MLevel1Cluster, NLevel1Cluster
template <index_t BlockSize,
@@ -109,7 +113,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
n_repeat * NPerLevel1Cluster + n_in_sub_c};
}

-#if USE_AMD_INLINE_ASM
+#if CK_USE_AMD_INLINE_ASM
// TODO: this is not working correctly
template <class FloatA, class FloatB, class FloatC>
__device__ void Run_asm(const FloatA* __restrict__ p_a_block,
@@ -423,3 +427,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
}
}
};
+
+} // namespace ck
+#endif

@@ -1,5 +1,9 @@
#pragma once
-#include "threadwise_tensor_slice_op.hpp"
+#ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
+#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
+
+#include "threadwise_generic_tensor_slice_copy.hpp"

+namespace ck {
+
// slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst
@@ -142,10 +146,10 @@ struct BlockwiseGenericTensorSliceCopy_v1

// complete offset
mThreadSrcOffset = accumulate_on_array(
-mThreadSrcPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
+mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));

mThreadDstOffset = accumulate_on_array(
-mThreadDstPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
+mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));

#if 0
if(get_block_1d_id() == 0)
@@ -388,3 +392,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
});
}
};
+
+} // namespace ck
+
+#endif

@@ -1,5 +1,9 @@
#pragma once
-#include "threadwise_tensor_slice_op.hpp"
+#ifndef CK_BLOCKWISE_TENSOR_SLICE_COPY_HPP
+#define CK_BLOCKWISE_TENSOR_SLICE_COPY_HPP
+
+#include "threadwise_tensor_slice_copy.hpp"

+namespace ck {
+
template <index_t BlockSize,
class Float,
@@ -120,7 +124,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
// compiler: will it really compute index here, or be merged with
// GetOffsetFromMultiIndex and
// optimized away???
-src_data_multi_id[i] *= src_sub_lengths.Get(I);
+src_data_multi_id(i) *= src_sub_lengths.Get(I);
});

// compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex
@@ -167,10 +171,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto src_data_per_cluster_per_dims =
thread_sub_tensor_lengths * SrcClusterLengths{};

-constexpr auto repeat_lengths =
-transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
-SrcLengths{},
-src_data_per_cluster_per_dims);
+constexpr auto repeat_lengths = transform_sequences(
+math::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

@@ -188,10 +190,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto src_data_per_cluster_per_dims =
thread_sub_tensor_lengths * SrcClusterLengths{};

-constexpr auto repeat_lengths =
-transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
-SrcLengths{},
-src_data_per_cluster_per_dims);
+constexpr auto repeat_lengths = transform_sequences(
+math::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

@@ -226,10 +226,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto src_data_per_cluster_per_dims =
thread_sub_tensor_lengths * SrcClusterLengths{};

-constexpr auto repeat_lengths =
-transform_sequences(mod_conv::integer_divide_ceiler<index_t>{},
-SrcLengths{},
-src_data_per_cluster_per_dims);
+constexpr auto repeat_lengths = transform_sequences(
+math::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

@@ -294,3 +292,6 @@ struct BlockwiseTensorSliceReorderCopy_v3
}).Else([&](auto fwd) { mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim); });
}
};
+
+} // namespace ck
+#endif

@@ -1,5 +1,7 @@
#pragma once
-#include "base.hpp"
+#ifndef CK_COMMON_HPP
+#define CK_COMMON_HPP

#include "utility.hpp"
#include "vector_type.hpp"
#include "integral_constant.hpp"
#include "Sequence.hpp"
@@ -8,6 +10,8 @@
#include "functional2.hpp"
#include "functional3.hpp"

-#if USE_AMD_INLINE_ASM
+#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
#endif

+#endif

@@ -1,23 +1,30 @@
#pragma once
+#ifndef CK_CONFIG_HPP
+#define CK_CONFIG_HPP
+
#cmakedefine01 DEVICE_BACKEND_HIP
#cmakedefine01 DEVICE_BACKEND_CUDA

#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
-#define USE_AMD_INLINE_ASM 1
+#define CK_USE_AMD_INLINE_ASM 1

// For some reason, the HIP compiler needs this definition to generate optimal load and store
// instructions
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
-#define USE_AMD_INLINE_ASM 0
+#define CK_USE_AMD_INLINE_ASM 0
#endif

+namespace ck {
+
#if DEVICE_BACKEND_HIP
// For some reason, the HIP compiler needs this definition to generate optimal load and store
// instructions
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
#else
// For some reason, CUDA needs this definition, otherwise the
// compiler won't generate optimal load and store instructions, and the
// kernel would produce wrong results, indicating the compiler fails to generate correct
@@ -58,3 +65,7 @@ __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const i
#endif
}
#endif
+
+} // namespace ck
+
+#endif
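
#cmakedefine01 is expanded by CMake's configure_file (see the CMakeLists change above) into #define VAR 1 or #define VAR 0 depending on whether the variable is truthy at configure time, which is why the code can test backends with plain #if rather than #ifdef. Configuring for the HIP backend would, for instance, produce:

    #define DEVICE_BACKEND_HIP 1
    #define DEVICE_BACKEND_CUDA 0
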

@@ -1,6 +1,10 @@
#pragma once
+#ifndef CK_CONV_COMMON_HPP
+#define CK_CONV_COMMON_HPP
+
#include "ConstantTensorDescriptor.hpp"

using namespace ck;

// this is ugly, only for 4d
template <class InDesc, class WeiDesc>
constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
@@ -117,3 +121,5 @@ constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc,
return sizeof(Float) *
(InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
}
+
+#endif

@@ -1,6 +1,10 @@
#pragma once
+#ifndef CK_DEVICE_HPP
+#define CK_DEVICE_HPP
+
#include <memory>
-#include "config.h"
+#include "config.hpp"

using namespace ck;

struct DeviceMem
{
@@ -56,3 +60,5 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt

return timer.GetElapsedTime();
}
+
+#endif

@@ -1,7 +1,11 @@
#pragma once
+#ifndef CK_FUNCTIONAL_HPP
+#define CK_FUNCTIONAL_HPP
+
#include "integral_constant.hpp"
#include "Sequence.hpp"

+namespace ck {
+
struct forwarder
{
template <typename T>
@@ -70,3 +74,6 @@ struct static_if<false>
return Type{};
}
};
+
+} // namespace ck
+#endif

@@ -1,7 +1,11 @@
#pragma once
+#ifndef CK_FUNCTIONAL2_HPP
+#define CK_FUNCTIONAL2_HPP
+
#include "functional.hpp"
#include "Sequence.hpp"

+namespace ck {
+
template <class>
struct static_for_impl;

@@ -59,3 +63,6 @@ accumulate_on_sequence(Seq, Reduce f, Number<Init> /*initial_value*/)

return result;
}
+
+} // namespace ck
+#endif

@@ -1,9 +1,13 @@
#pragma once
+#ifndef CK_FUNCTIONAL3_HPP
+#define CK_FUNCTIONAL3_HPP
+
#include "functional.hpp"
#include "functional2.hpp"
#include "Sequence.hpp"
#include "Array.hpp"

+namespace ck {
+
// RemainLengths: Sequence<...>
template <class RemainLengths>
struct static_ford_impl
@@ -107,3 +111,6 @@ struct ford
}
}
};
+
+} // namespace ck
+#endif

@@ -1,11 +1,15 @@
#pragma once
+#ifndef CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW
+#define CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW
+
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_4d_tensor_op.hpp"
-#include "threadwise_tensor_slice_op.hpp"
+#include "threadwise_tensor_slice_copy.hpp"
#include "threadwise_direct_convolution.hpp"

+namespace ck {
+
template <index_t GridSize,
index_t BlockSize,
class Float,
@@ -245,3 +249,6 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
Number<1>{});
}
};
+
+} // namespace ck
+#endif

@@ -1,13 +1,17 @@
#pragma once
+#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN
+#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN
+
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_4d_tensor_op.hpp"
#include "blockwise_2d_tensor_op.hpp"
-#include "threadwise_tensor_slice_op.hpp"
+#include "threadwise_tensor_slice_copy.hpp"
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_batched_gemm.hpp"

+namespace ck {
+
template <index_t GridSize,
index_t BlockSize,
class Float,
@@ -103,10 +107,10 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn

// tensor view of blockwise input and weight in LDS
// be careful of alignment
-constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
-WeiBlockCopyDataPerRead_K,
-GemmDataPerReadA,
-GemmDataPerReadB);
+constexpr index_t max_align = math::lcm(InBlockCopyDataPerRead_N,
+WeiBlockCopyDataPerRead_K,
+GemmDataPerReadA,
+GemmDataPerReadB);

constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{},
@@ -119,11 +123,11 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn

constexpr auto wei_cyx_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock * Y * X, KPerBlock>{},
-Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+Number<math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

constexpr auto wei_c_y_x_k_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, Y, X, KPerBlock>{},
-Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+Number<math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

// tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
@@ -390,3 +394,6 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
});
}
};
+
+} // namespace ck
+#endif
|
||||
|
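The only change in the hunks above besides the guard is the rename of `mod_conv::lcm` to `math::lcm`; the alignment arithmetic is untouched. A sketch of the helper's assumed shape (a constexpr variadic least-common-multiple; the actual definition lives in the repo's utility headers and may differ):

    namespace math {
    template <class T>
    constexpr T gcd(T a, T b) { return b == 0 ? a : gcd(b, a % b); }

    template <class T>
    constexpr T lcm(T a, T b) { return (a / gcd(a, b)) * b; }

    template <class T, class... Ts>
    constexpr T lcm(T a, T b, Ts... cs) { return lcm(lcm(a, b), cs...); }
    } // namespace math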
@@ -1,14 +1,18 @@
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN

#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_3d_tensor_op.hpp"
#include "blockwise_4d_tensor_op.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_copy.hpp"
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_batched_gemm.hpp"

namespace ck {

template <index_t GridSize,
          index_t BlockSize,
          class Float,
@@ -104,10 +108,10 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn

        // LDS tensor view
        // be careful of alignment
        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                    WeiBlockCopyDataPerRead_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
        constexpr index_t max_align = math::lcm(InBlockCopyDataPerRead_N,
                                                WeiBlockCopyDataPerRead_K,
                                                GemmDataPerReadA,
                                                GemmDataPerReadB);

        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WiPerBlock, NPerBlock>{},
@@ -120,7 +124,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn

        constexpr auto wei_c_x_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, X, KPerBlock>{},
            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
            Number<math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
@@ -426,3 +430,6 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
        });
    }
};

} // namespace ck
#endif

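Why `max_align` is a least common multiple: the same LDS buffer is written by the blockwise copies and read by the GEMM with different vector widths, so the padded row pitch must be a multiple of every width at once. For illustration only (these widths are invented, not taken from any config in the repo): with read/write widths 4, 2, 4 and 2, the pitch must be a multiple of lcm(4, 2, 4, 2) = 4, and a 4-element pad then satisfies every vectorized access:

    // hypothetical widths, checked against the math::lcm sketch above
    static_assert(math::lcm(4, 2, 4, 2) == 4, "one LDS pitch must serve every vector width");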
@@ -1,352 +0,0 @@
#pragma once
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_3d_tensor_op.hpp"
#include "blockwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_batched_gemm.hpp"

template <index_t GridSize,
          index_t BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          index_t NPerBlock,
          index_t KPerBlock,
          index_t CPerBlock,
          index_t HoPerBlock,
          index_t WoPerBlock,
          index_t NPerThread,
          index_t KPerThread,
          index_t HoPerThread,
          index_t WoPerThread,
          index_t GemmMPerThreadSubC,
          index_t GemmNPerThreadSubC,
          index_t GemmMLevel0Cluster,
          index_t GemmNLevel0Cluster,
          index_t GemmMLevel1Cluster,
          index_t GemmNLevel1Cluster,
          index_t GemmKPerThreadLoop,
          index_t GemmDataPerReadA,
          index_t GemmDataPerReadB,
          class InBlockReorderSrcSubLengths_NCHW,
          class InBlockReorderSrcClusterLengths_NCHW,
          class InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
          index_t InBlockReorderDataPerRead_W,
          index_t InBlockReorderDataPerWrite_N,
          class WeiBlockCopyClusterLengths_CXK,
          index_t WeiBlockCopyDataPerRead_K,
          index_t OutThreadCopyDataPerWrite_N>
struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
{
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
                        Float* const __restrict__ p_out_global) const
    {
        // be careful of this assertion
        static_assert(
            NPerBlock % NPerThread == 0 &&
                ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) ||
                 (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock &&
                  GemmNPerThreadSubC % NPerThread == 0)),
            "wrong!");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
        constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
        constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{};

        constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1);

        constexpr index_t K  = out_k_h_w_n_global_desc.GetLength(I0);
        constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1);
        constexpr index_t Wo = out_k_h_w_n_global_desc.GetLength(I2);
        constexpr index_t N  = out_k_h_w_n_global_desc.GetLength(I3);

        constexpr index_t Y = wei_c_y_x_k_global_desc.GetLength(I1);
        constexpr index_t X = wei_c_y_x_k_global_desc.GetLength(I2);

        constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
        constexpr index_t WiPerBlock = WoPerBlock + X - 1;

        // divide block work: [K, Ho, Wo, N]
        static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 &&
                          Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
                      "wrong! cannot evenly divide work for workgroup ");

        constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock;
        constexpr index_t HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock;
        constexpr index_t WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock;
        constexpr index_t NBlockWork = (N + NPerBlock - 1) / NPerBlock;

        const index_t k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork);
        index_t itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork);
        const index_t h_block_work_id = itmp / (WBlockWork * NBlockWork);
        itmp -= h_block_work_id * (WBlockWork * NBlockWork);
        const index_t w_block_work_id = itmp / NBlockWork;
        const index_t n_block_work_id = itmp - w_block_work_id * NBlockWork;

        const index_t k_block_data_begin  = k_block_work_id * KPerBlock;
        const index_t ho_block_data_begin = h_block_work_id * HoPerBlock;
        const index_t wo_block_data_begin = w_block_work_id * WoPerBlock;
        const index_t n_block_data_begin  = n_block_work_id * NPerBlock;

        const index_t hi_block_data_begin = ho_block_data_begin;
        const index_t wi_block_data_begin = wo_block_data_begin;

        // global tensor view
        constexpr auto wei_c_x_k_global_desc =
            make_ConstantTensorDescriptor(Sequence<C, X, K>{}, Sequence<Y * X * K, K, 1>{});

        // LDS tensor view
        // be careful of alignment
        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                    WeiBlockCopyDataPerRead_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);

        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WiPerBlock, NPerBlock>{}, Number<max_align>{});

        constexpr auto wei_c_x_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, X, KPerBlock>{}, Number<max_align>{});

        // tensor view of threadwise output in register
        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

        // blockwise copy
        // input: reorder from [N, C, Hi, Wi] to [C, Hi, Wi, N]
        constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

        const auto blockwise_in_copy_reorder = BlockwiseTensorSliceReorderCopy_v3<
            BlockSize,
            Float,
            decltype(in_n_c_h_w_global_desc),
            decltype(in_c_h_w_n_block_desc),
            Sequence<NPerBlock, CPerBlock, HoPerBlock, WiPerBlock>,
            InBlockReorderSrcSubLengths_NCHW,
            InBlockReorderSrcClusterLengths_NCHW,
            decltype(map_chwn2nchw),
            InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
            InBlockReorderDataPerRead_W,
            InBlockReorderDataPerWrite_N>{};

        // blockwise wei copy
        // format is [CPerBlock, X * KPerBlock]
        const auto blockwise_wei_copy =
            Blockwise3dTensorCopy3<BlockSize,
                                   Float,
                                   decltype(wei_c_x_k_global_desc),
                                   decltype(wei_c_x_k_block_desc),
                                   decltype(wei_c_x_k_block_desc.GetLengths()),
                                   WeiBlockCopyClusterLengths_CXK,
                                   WeiBlockCopyDataPerRead_K>{};

        // a series of blockwise batched GEMM
        // C_matrix += transpose(A_matrix) * B_matrix
        // A_matrix and B_matrix saved in LDS, C_matrix saved in register
        // A_matrix[C,K] is a sub-matrix of wei_block[C,K]
        // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
        // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
        constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
            Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_x_k_block_desc.GetStride(I0)>{});

        constexpr auto b_c_wn_block_mtx_desc =
            make_ConstantMatrixDescriptor(Number<CPerBlock>{},
                                          Number<WoPerBlock * NPerBlock>{},
                                          Number<in_c_h_w_n_block_desc.GetStride(I0)>{});

        constexpr auto c_k_wn_thread_mtx_desc =
            make_ConstantMatrixDescriptor(Number<KPerThread>{},
                                          Number<WoPerThread * NPerThread>{},
                                          Number<out_k_h_w_n_thread_desc.GetStride(I0)>{});

        const auto blockwise_batch_gemm =
            BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2<
                BlockSize,
                decltype(a_c_k_block_mtx_desc),
                decltype(b_c_wn_block_mtx_desc),
                decltype(c_k_wn_thread_mtx_desc),
                0,
                in_c_h_w_n_block_desc.GetStride(I1),
                out_k_h_w_n_thread_desc.GetStride(I1),
                HoPerBlock,
                GemmMPerThreadSubC,
                GemmNPerThreadSubC,
                GemmMLevel0Cluster,
                GemmNLevel0Cluster,
                GemmMLevel1Cluster,
                GemmNLevel1Cluster,
                GemmKPerThreadLoop,
                HoPerThread,
                GemmDataPerReadA,
                GemmDataPerReadB>{};

        // LDS: be careful of alignment
        constexpr index_t in_block_space =
            in_c_h_w_n_block_desc.GetElementSpace(Number<max_align>{});
        constexpr index_t wei_block_space =
            wei_c_x_k_block_desc.GetElementSpace(Number<max_align>{});

        __shared__ Float p_in_block[in_block_space];
        __shared__ Float p_wei_block[wei_block_space];

        // register
        Float p_out_thread[out_k_h_w_n_thread_desc.GetElementSpace()];

#if 0
        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
        {
            print_ConstantTensorDescriptor(in_c_h_w_n_global_desc, "in_c_h_w_n_global_desc");
            print_ConstantTensorDescriptor(wei_c_y_x_k_global_desc, "wei_c_y_x_k_global_desc");

            print_ConstantTensorDescriptor(in_c_h_w_n_block_desc, "in_c_h_w_n_block_desc");
            print_ConstantTensorDescriptor(wei_c_x_k_block_desc, "wei_c_x_k_block_desc");

            printf("in_block_space %u, wei_block_space %u\n", in_block_space, wei_block_space);
        }
#endif

        // set threadwise output tensor to 0
        threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);

#if 0
        const Float* p_in_global_block_offset =
            p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
                              n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin);

        const Float* p_wei_global_block_offset =
            p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);

        for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
                    p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
                    p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
        {
            for(index_t y = 0; y < Y; ++y)
            {
                blockwise_in_copy_reorder.Run(p_in_global_block_offset +
                                                  in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, 0),
                                              p_in_block);

                blockwise_wei_copy.Run(p_wei_global_block_offset +
                                           wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0),
                                       p_wei_block);

                __syncthreads();

                for(index_t x = 0; x < X; ++x)
                {
                    blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
                                             p_in_block +
                                                 in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
                                             p_out_thread);
                }

                __syncthreads();
            }
        }
#else
        for(index_t y = 0; y < Y; ++y)
        {
            const Float* p_in_global_block_offset =
                p_in_global +
                in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
                    n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin);

            const Float* p_wei_global_block_offset =
                p_wei_global +
                wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, k_block_data_begin);

            for(index_t c_block_data_begin = 0; c_block_data_begin < C;
                c_block_data_begin += CPerBlock,
                p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
                p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
            {
                Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()];
                Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];

                blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset,
                                                                   p_in_clipboard);
                blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                            p_wei_clipboard);

                blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block);
                blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_clipboard, p_in_block);

                __syncthreads();

                for(index_t x = 0; x < X; ++x)
                {
                    blockwise_batch_gemm.Run(
                        p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
                        p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
                        p_out_thread);
                }

                __syncthreads();
            }
        }
#endif

        // output: copy from register to global memory
        const auto c_thread_mtx_begin =
            blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

        const index_t k_thread_data_begin  = c_thread_mtx_begin.row;
        const index_t ho_thread_data_begin = c_thread_mtx_begin.batch;
        const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock;
        const index_t n_thread_data_begin =
            c_thread_mtx_begin.col - NPerBlock * wo_thread_data_begin;

        // output is a 10d tensor
        constexpr index_t N2 = GemmNPerThreadSubC;
        constexpr index_t N1 = NPerBlock / N2;

        constexpr index_t W2 =
            (GemmNLevel0Cluster * GemmNLevel1Cluster) / (NPerBlock / GemmNPerThreadSubC);
        constexpr index_t W1 = WoPerBlock / W2;

        constexpr index_t K2 = GemmMPerThreadSubC;
        constexpr index_t K1 = KPerBlock / KPerThread;

        constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor(
            Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2), W1, W2, N / (N1 * N2), N1, N2>{});

        constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
            Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, 1, 1, N2>{});

#if 0
        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
        {
            print_ConstantTensorDescriptor(out_khwn_thread_desc, "out_khwn_thread_desc");
            print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc");

            print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
            print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
        }
#endif

        threadwise_10d_tensor_copy(out_10d_thread_desc,
                                   p_out_thread,
                                   out_10d_global_desc,
                                   p_out_global +
                                       out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
                                           k_block_data_begin + k_thread_data_begin,
                                           ho_block_data_begin + ho_thread_data_begin,
                                           wo_block_data_begin + wo_thread_data_begin,
                                           n_block_data_begin + n_thread_data_begin),
                                   out_10d_thread_desc.GetLengths(),
                                   Number<OutThreadCopyDataPerWrite_N>{});
    }
};
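The file removed above is the nchw-layout v1r2 kernel; its chwn counterpart with the same structure survives in this commit. The comment block "C_matrix += transpose(A_matrix) * B_matrix" is the implicit-GEMM view shared by all of these kernels. As a host-side reference (a sketch; the flat array layouts and the function name are illustrative, not the kernel's actual data structures), accumulating one filter tap (y, x) of one output row over the channel dimension is exactly C += transpose(A) * B:

    // A = wei[C][K] at tap (y, x); B = in[C][Wo][N] at input row hi = ho + y,
    // shifted by x; C = out[K][Wo][N] at output row ho. All layouts illustrative.
    void conv_tap_as_gemm(const float* wei_ck,
                          const float* in_cwn,
                          float* out_kwn,
                          int C, int K, int Wo, int N)
    {
        for(int c = 0; c < C; ++c)                 // GEMM reduction dimension
            for(int k = 0; k < K; ++k)             // GEMM M dimension
                for(int wn = 0; wn < Wo * N; ++wn) // GEMM N dimension
                    out_kwn[k * Wo * N + wn] +=
                        wei_ck[c * K + k] * in_cwn[c * Wo * N + wn];
    }

The kernels run this contraction blockwise out of LDS and batch it over the HoPerBlock output rows handled by one workgroup.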
@@ -1,13 +1,17 @@
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN

#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_4d_tensor_op.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_copy.hpp"
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_batched_gemm.hpp"

namespace ck {

template <index_t GridSize,
          index_t BlockSize,
          class Float,
@@ -74,10 +78,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                          Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
                      "wrong! cannot evenly divide work for workgroup ");

        constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
        constexpr index_t KBlockWork = mod_conv::integer_divide_ceil(K, KPerBlock);
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
        constexpr index_t NBlockWork = math::integer_divide_ceil(N, NPerBlock);
        constexpr index_t KBlockWork = math::integer_divide_ceil(K, KPerBlock);
        constexpr index_t HBlockWork = math::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = math::integer_divide_ceil(Wo, WoPerBlock);

        constexpr auto block_work_desc = make_ConstantTensorDescriptor(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
@@ -99,10 +103,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn

        // LDS tensor view
        // be careful of alignment
        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                    WeiBlockCopyDataPerRead_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
        constexpr index_t max_align = math::lcm(InBlockCopyDataPerRead_N,
                                                WeiBlockCopyDataPerRead_K,
                                                GemmDataPerReadA,
                                                GemmDataPerReadB);

        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
@@ -115,7 +119,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn

        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
            Number<math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
@@ -416,3 +420,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
        });
    }
};

} // namespace ck
#endif

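`mod_conv::integer_divide_ceil` likewise becomes `math::integer_divide_ceil`; other files in this diff still spell the same computation inline as `(K + KPerBlock - 1) / KPerBlock`. A sketch of the assumed helper, matching that inline formula:

    namespace math {
    template <class T>
    constexpr T integer_divide_ceil(T a, T b) { return (a + b - 1) / b; }
    } // namespace math

For example, Ho = 32 with HoPerBlock = 8 yields HBlockWork = 4 workgroups along Ho (values invented for illustration; the static_assert above additionally requires the division to be exact).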
@@ -1,13 +1,17 @@
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER

#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_4d_tensor_op.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_copy.hpp"
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_batched_gemm.hpp"

namespace ck {

template <index_t GridSize,
          index_t BlockSize,
          class Float,
@@ -36,7 +40,7 @@ template <index_t GridSize,
          index_t InBlockCopyDataPerRead_N,
          index_t WeiBlockCopyDataPerRead_K,
          index_t OutThreadCopyDataPerWrite_N>
struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
{
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
@@ -80,10 +84,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                          Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
                      "wrong! cannot evenly divide work for workgroup ");

        constexpr index_t KBlockWork = mod_conv::integer_divide_ceil(K, KPerBlock);
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
        constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
        constexpr index_t KBlockWork = math::integer_divide_ceil(K, KPerBlock);
        constexpr index_t HBlockWork = math::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = math::integer_divide_ceil(Wo, WoPerBlock);
        constexpr index_t NBlockWork = math::integer_divide_ceil(N, NPerBlock);

        constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
            Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{});
@@ -104,10 +108,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn

        // LDS tensor view
        // be careful of alignment
        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                    WeiBlockCopyDataPerRead_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
        constexpr index_t max_align = math::lcm(InBlockCopyDataPerRead_N,
                                                WeiBlockCopyDataPerRead_K,
                                                GemmDataPerReadA,
                                                GemmDataPerReadB);

        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
@@ -120,7 +124,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn

        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
            Number<math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
@@ -466,3 +470,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
        });
    }
};

} // namespace ck
#endif
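`block_work_desc` above is a packed 4d descriptor whose only job is to split `get_block_1d_id()` into per-dimension workgroup coordinates. A self-contained host-side model of that decomposition (a sketch; the real `GetMultiIndexFrom1dIndex` performs the same arithmetic with compile-time strides):

    #include <array>

    // last dimension varies fastest, matching a packed row-major descriptor
    std::array<int, 4> multi_index_from_1d(int id, const std::array<int, 4>& lengths)
    {
        std::array<int, 4> multi{};
        for(int i = 3; i >= 0; --i)
        {
            multi[i] = id % lengths[i];
            id /= lengths[i];
        }
        return multi;
    }

Each coordinate is then scaled by its tile size, e.g. `block_work_multi_id[0] * KPerBlock` for the K-first ordering above.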
@@ -1,501 +0,0 @@
#pragma once
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_batched_gemm.hpp"

template <index_t GridSize,
          index_t BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          index_t NPerBlock,
          index_t KPerBlock,
          index_t CPerBlock,
          index_t HoPerBlock,
          index_t WoPerBlock,
          index_t NPerThread,
          index_t KPerThread,
          index_t HoPerThread,
          index_t WoPerThread,
          index_t GemmMPerThreadSubC,
          index_t GemmNPerThreadSubC,
          index_t GemmMLevel0Cluster,
          index_t GemmNLevel0Cluster,
          index_t GemmMLevel1Cluster,
          index_t GemmNLevel1Cluster,
          index_t GemmKPerThreadLoop,
          index_t GemmDataPerReadA,
          index_t GemmDataPerReadB,
          class InBlockReorderSrcSubLengths_NCHW,
          class InBlockReorderSrcClusterLengths_NCHW,
          class InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
          index_t InBlockReorderDataPerRead_W,
          index_t InBlockReorderDataPerWrite_N,
          class WeiBlockCopyClusterLengths_CK, // not used
          index_t WeiBlockCopyDataPerRead_K,
          index_t OutThreadCopyDataPerWrite_N>
struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
{
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
                        Float* const __restrict__ p_out_global) const
    {
        // be careful of this assertion
        static_assert(
            NPerBlock % NPerThread == 0 &&
                ((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) ||
                 (GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock &&
                  GemmNPerThreadSubC % NPerThread == 0)),
            "wrong!");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
        constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
        constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{};

        constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1);

        constexpr index_t K  = out_k_h_w_n_global_desc.GetLength(I0);
        constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1);
        constexpr index_t Wo = out_k_h_w_n_global_desc.GetLength(I2);
        constexpr index_t N  = out_k_h_w_n_global_desc.GetLength(I3);

        constexpr index_t Y = wei_c_y_x_k_global_desc.GetLength(I1);
        constexpr index_t X = wei_c_y_x_k_global_desc.GetLength(I2);

        // assert for LDS double buffer
        static_assert(C % (2 * CPerBlock) == 0, "C cannot be evenly divided");

        // divide block work: [K, Ho, Wo, N]
        static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 &&
                          Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
                      "wrong! cannot evenly divide work for workgroup ");

        constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
        constexpr index_t KBlockWork = mod_conv::integer_divide_ceil(K, KPerBlock);
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);

        constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});

        const auto block_work_multi_id =
            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());

        const index_t n_block_data_begin  = block_work_multi_id[0] * NPerBlock;
        const index_t k_block_data_begin  = block_work_multi_id[1] * KPerBlock;
        const index_t ho_block_data_begin = block_work_multi_id[2] * HoPerBlock;
        const index_t wo_block_data_begin = block_work_multi_id[3] * WoPerBlock;

        const index_t hi_block_data_begin = ho_block_data_begin;
        const index_t wi_block_data_begin = wo_block_data_begin;

        // global tensor view
        constexpr auto wei_c_k_global_desc =
            make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});

        // LDS tensor view
        // be careful of alignment
        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                    WeiBlockCopyDataPerRead_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);

        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockReorderDataPerWrite_N>{});

        // this check is ad-hoc
        // TODO: need to properly implement tensor descriptor with alignment
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not met");

        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

        // blockwise copy
        // input: reorder from [N, C, Hi, Wi] to [C, Hi, Wi, N]
        constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

        const auto blockwise_in_copy_reorder = BlockwiseTensorSliceReorderCopy_v3<
            BlockSize,
            Float,
            decltype(in_n_c_h_w_global_desc),
            decltype(in_c_h_w_n_block_desc),
            Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
            InBlockReorderSrcSubLengths_NCHW,
            InBlockReorderSrcClusterLengths_NCHW,
            decltype(map_chwn2nchw),
            InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
            InBlockReorderDataPerRead_W,
            InBlockReorderDataPerWrite_N>({0, 0, 0, 0}, {0, 0, 0, 0});

        // blockwise wei copy
        // format is [CPerBlock, KPerBlock]
        const auto blockwise_wei_copy =
            Blockwise2dTensorCopy3<BlockSize,
                                   Float,
                                   decltype(wei_c_k_global_desc),
                                   decltype(wei_c_k_block_desc),
                                   decltype(wei_c_k_block_desc.GetLengths()),
                                   WeiBlockCopyDataPerRead_K>{};

        // a series of blockwise batched GEMM
        // C_matrix += transpose(A_matrix) * B_matrix
        // A_matrix and B_matrix saved in LDS, C_matrix saved in register
        // A_matrix[C,K] is a sub-matrix of wei_block[C,K]
        // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
        // C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
        constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
            Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_k_block_desc.GetStride(I0)>{});

        constexpr auto b_c_wn_block_mtx_desc =
            make_ConstantMatrixDescriptor(Number<CPerBlock>{},
                                          Number<WoPerBlock * NPerBlock>{},
                                          Number<in_c_h_w_n_block_desc.GetStride(I0)>{});

        constexpr auto c_k_wn_thread_mtx_desc =
            make_ConstantMatrixDescriptor(Number<KPerThread>{},
                                          Number<WoPerThread * NPerThread>{},
                                          Number<out_k_h_w_n_thread_desc.GetStride(I0)>{});

        const auto blockwise_batch_gemm =
            BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2<
                BlockSize,
                decltype(a_c_k_block_mtx_desc),
                decltype(b_c_wn_block_mtx_desc),
                decltype(c_k_wn_thread_mtx_desc),
                0,
                in_c_h_w_n_block_desc.GetStride(I1),
                out_k_h_w_n_thread_desc.GetStride(I1),
                HoPerBlock,
                GemmMPerThreadSubC,
                GemmNPerThreadSubC,
                GemmMLevel0Cluster,
                GemmNLevel0Cluster,
                GemmMLevel1Cluster,
                GemmNLevel1Cluster,
                GemmKPerThreadLoop,
                HoPerThread,
                GemmDataPerReadA,
                GemmDataPerReadB>{};

        // choose GEMM implementation here
        const auto run_blockwise_batch_gemm = [&](auto... Xs) {
#if 1
            return blockwise_batch_gemm.Run(Xs...);
#elif 0
            return blockwise_batch_gemm.Run_asm(Xs...);
#else
            return blockwise_batch_gemm.Run_asm_v2(Xs...);
#endif
        };

        // LDS: be careful of alignment
        constexpr index_t in_block_space =
            in_c_h_w_n_block_desc.GetElementSpace(Number<max_align>{});
        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number<max_align>{});

        // LDS double buffer
        __shared__ Float p_in_block_double[2 * in_block_space];
        __shared__ Float p_wei_block_double[2 * wei_block_space];

        // register
        // C++ lambda doesn't capture array, use pointer instead
        Float p_out_thread_data[out_k_h_w_n_thread_desc.GetElementSpace()];
        Float* const p_out_thread = p_out_thread_data;

#if 0
        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
        {
            print_ConstantTensorDescriptor(in_c_h_w_n_global_desc, "in_c_h_w_n_global_desc");
            print_ConstantTensorDescriptor(wei_c_y_x_k_global_desc, "wei_c_y_x_k_global_desc");

            print_ConstantTensorDescriptor(in_c_h_w_n_block_desc, "in_c_h_w_n_block_desc");
            print_ConstantTensorDescriptor(wei_c_k_block_desc, "wei_c_k_block_desc");

            printf("in_block_space %u, wei_block_space %u\n", in_block_space, wei_block_space);
        }
#endif

        // set threadwise output tensor to 0
        threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);

        for(index_t y = 0; y < Y; ++y)
        {
            for(index_t x = 0; x < X; ++x)
            {
                const Float* p_in_global_block_offset =
                    p_in_global +
                    in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
                        n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x);

                const Float* p_wei_global_block_offset =
                    p_wei_global +
                    wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);

                // LDS double buffer: preload data into LDS
                {
                    Float p_in_register_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()];
                    Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];

                    blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset,
                                                                       p_in_register_clipboard);
                    blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                                p_wei_register_clipboard);

                    blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_register_clipboard,
                                                                        p_in_block_double);
                    blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
                                                                 p_wei_block_double);
                }

                // LDS double buffer: main body
                for(index_t c_block_data_begin = 0; c_block_data_begin + 2 * CPerBlock < C;
                    c_block_data_begin += 2 * CPerBlock)
                {
#pragma unroll
                    for(index_t iloop = 0; iloop < 2; ++iloop)
                    {
                        const bool even_loop = (iloop % 2 == 0);

                        Float* p_in_block_now =
                            even_loop ? p_in_block_double : p_in_block_double + in_block_space;
                        Float* p_wei_block_now =
                            even_loop ? p_wei_block_double : p_wei_block_double + wei_block_space;

                        Float* p_in_block_next =
                            even_loop ? p_in_block_double + in_block_space : p_in_block_double;
                        Float* p_wei_block_next =
                            even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;

                        Float p_in_register_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()];
                        Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];

                        p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1);
                        p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0);

                        __syncthreads();

                        // LDS double buffer: load next data from device mem
                        blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset,
                                                                           p_in_register_clipboard);
                        blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                                    p_wei_register_clipboard);

                        // LDS double buffer: GEMM on current data
                        run_blockwise_batch_gemm(p_wei_block_now, p_in_block_now, p_out_thread);

                        // LDS double buffer: store next data to LDS
                        blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_register_clipboard,
                                                                            p_in_block_next);
                        blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
                                                                     p_wei_block_next);
                    }
                }

                // LDS double buffer: tail
                {
                    Float p_in_register_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()];
                    Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];

                    // even iteration
                    p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1);
                    p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0);

                    __syncthreads();

                    // LDS double buffer: load next data from device mem
                    blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset,
                                                                       p_in_register_clipboard);
                    blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                                p_wei_register_clipboard);

                    // LDS double buffer: GEMM on current data
                    run_blockwise_batch_gemm(p_wei_block_double, p_in_block_double, p_out_thread);

                    // LDS double buffer: store next data to LDS
                    blockwise_in_copy_reorder.RunStoreRegisterClipboard(
                        p_in_register_clipboard, p_in_block_double + in_block_space);
                    blockwise_wei_copy.RunStoreRegisterClipboard(
                        p_wei_register_clipboard, p_wei_block_double + wei_block_space);

                    // odd iteration
                    __syncthreads();

                    // LDS double buffer: GEMM on current data
                    run_blockwise_batch_gemm(p_wei_block_double + wei_block_space,
                                             p_in_block_double + in_block_space,
                                             p_out_thread);
                }
            }
        }

        // output: copy from register to global memory
        const auto c_thread_mtx_begin =
            blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

        const index_t k_thread_data_begin  = c_thread_mtx_begin.row;
        const index_t ho_thread_data_begin = c_thread_mtx_begin.batch;
        const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock;
        const index_t n_thread_data_begin  = c_thread_mtx_begin.col % NPerBlock;

        static_if<GemmNPerThreadSubC <= NPerBlock>{}([&](auto fwd) {
            // fwd does nothing but perfect forwarding; this trick makes the
            // lambda generic, so it is not compiled until instantiated
            static_assert(
                (fwd(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0),
                "wrong!");

            // output is a 10d tensor
            constexpr index_t N2 = GemmNPerThreadSubC;
            constexpr index_t N1 = NPerBlock / N2;

            constexpr index_t W2 =
                (GemmNLevel0Cluster * GemmNLevel1Cluster) / fwd(NPerBlock / GemmNPerThreadSubC);
            constexpr index_t W1 = WoPerBlock / W2;

            constexpr index_t K2 = GemmMPerThreadSubC;
            constexpr index_t K1 = KPerBlock / KPerThread;

#if 0
            constexpr auto out_10d_global_desc =
                make_ConstantTensorDescriptor(Sequence<K / (K1 * K2),
                                                       K1,
                                                       K2,
                                                       Ho,
                                                       Wo / (W1 * W2),
                                                       W1,
                                                       W2,
                                                       N / fwd(N1 * N2),
                                                       N1,
                                                       N2>{});

            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
                Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, 1, 1, N2>{});
#else
            constexpr auto out_10d_global_desc = fwd(out_k_h_w_n_global_desc)
                                                     .Fold(I3, Number<N1>{}, Number<N2>{})
                                                     .Fold(I2, Number<W1>{}, Number<W2>{})
                                                     .Fold(I0, Number<K1>{}, Number<K2>{});

            constexpr auto out_10d_thread_desc = fwd(out_k_h_w_n_thread_desc)
                                                     .Fold(I3, Number<1>{}, Number<N2>{})
                                                     .Fold(I2, Number<W1>{}, Number<1>{})
                                                     .Fold(I0, Number<1>{}, Number<K2>{});
#endif

#if 0
            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
            {
                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
                                               "out_k_h_w_n_thread_desc");
                print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc");

                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
                                               "out_k_h_w_n_global_desc");
                print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
            }
#endif

            threadwise_tensor_slice_copy(out_10d_thread_desc,
                                         p_out_thread,
                                         out_10d_global_desc,
                                         p_out_global +
                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
                                                 k_block_data_begin + k_thread_data_begin,
                                                 ho_block_data_begin + ho_thread_data_begin,
                                                 wo_block_data_begin + wo_thread_data_begin,
                                                 n_block_data_begin + n_thread_data_begin),
                                         out_10d_thread_desc.GetLengths(),
                                         Number<OutThreadCopyDataPerWrite_N>{});
        }).Else([&](auto fwd) {
            static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                              GemmNPerThreadSubC % NPerThread == 0,
                          "wrong!");

            // output is a 10d tensor
            constexpr index_t N1 = NPerBlock;

            constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock;
            constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster;
            constexpr index_t W1 = WoPerBlock / fwd(W2 * W3);

            constexpr index_t K2 = GemmMPerThreadSubC;
            constexpr index_t K1 = KPerBlock / KPerThread;

#if 0
            constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_packed(
                Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});

            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_packed(
                Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
#else
            constexpr auto out_10d_global_desc =
                fwd(out_k_h_w_n_global_desc)
                    .Fold(I3, Number<N1>{})
                    .Fold(I2, Number<W1>{}, Number<W2>{}, Number<W3>{})
                    .Fold(I0, Number<K1>{}, Number<K2>{});

            constexpr auto out_10d_thread_desc =
                fwd(out_k_h_w_n_thread_desc)
                    .Fold(I3, Number<N1>{})
                    .Fold(I2, Number<W1>{}, Number<1>{}, Number<W3>{})
                    .Fold(I0, Number<1>{}, Number<K2>{});
#endif

#if 0
            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
            {
                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
                                               "out_k_h_w_n_thread_desc");
                print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc");

                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
                                               "out_k_h_w_n_global_desc");
                print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");

                for(index_t i = 0; i < 64; ++i)
                {
                    printf("out %f, ", p_out_thread[i]);
                }
            }
#endif

            threadwise_tensor_slice_copy(out_10d_thread_desc,
                                         p_out_thread,
                                         out_10d_global_desc,
                                         p_out_global +
                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
                                                 k_block_data_begin + k_thread_data_begin,
                                                 ho_block_data_begin + ho_thread_data_begin,
                                                 wo_block_data_begin + wo_thread_data_begin,
                                                 n_block_data_begin + n_thread_data_begin),
                                         out_10d_thread_desc.GetLengths(),
                                         Number<OutThreadCopyDataPerWrite_N>{});
        });
    }
};
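The deleted file above is the nchw variant of the LDS double-buffer schedule; the chwn variant earlier in this commit keeps the same structure. Modeled host-side (a sketch; the memcpy and the running sum stand in for the register-clipboard copies and the blockwise GEMM), the ping-pong between the even and odd halves of `p_in_block_double`/`p_wei_block_double` looks like this:

    #include <cstring>
    #include <vector>

    // Consume num_tiles tiles of tile_len floats, staging tile i + 1 into the
    // half of the buffer not being read while tile i is consumed. On the GPU,
    // __syncthreads() separates one iteration's LDS stores from the next
    // iteration's reads; here the loop boundary plays that role.
    float consume_tiles_double_buffered(const float* src, int num_tiles, int tile_len)
    {
        std::vector<float> buf(2 * tile_len);
        std::memcpy(buf.data(), src, tile_len * sizeof(float)); // preload tile 0

        float acc = 0.f;
        for(int i = 0; i < num_tiles; ++i)
        {
            const float* now = buf.data() + (i % 2) * tile_len;
            if(i + 1 < num_tiles) // stage the next tile into the other half
                std::memcpy(buf.data() + ((i + 1) % 2) * tile_len,
                            src + (i + 1) * tile_len,
                            tile_len * sizeof(float));
            for(int j = 0; j < tile_len; ++j) // "GEMM on current data"
                acc += now[j];
        }
        return acc;
    }

The payoff is that the global-memory fetch of tile i + 1 overlaps the compute on tile i, at the cost of twice the LDS footprint, which is why the kernel asserts C % (2 * CPerBlock) == 0.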
@@ -1,462 +0,0 @@
|
||||
#pragma once
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_batched_gemm.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
class Float,
|
||||
class InGlobalDesc,
|
||||
class WeiGlobalDesc,
|
||||
class OutGlobalDesc,
|
||||
index_t NPerBlock,
|
||||
index_t KPerBlock,
|
||||
index_t CPerBlock,
|
||||
index_t HoPerBlock,
|
||||
index_t WoPerBlock,
|
||||
index_t NPerThread,
|
||||
index_t KPerThread,
|
||||
index_t HoPerThread,
|
||||
index_t WoPerThread,
|
||||
index_t GemmMPerThreadSubC,
|
||||
index_t GemmNPerThreadSubC,
|
||||
index_t GemmMLevel0Cluster,
|
||||
index_t GemmNLevel0Cluster,
|
||||
index_t GemmMLevel1Cluster,
|
||||
index_t GemmNLevel1Cluster,
|
||||
index_t GemmKPerThreadLoop,
|
||||
index_t GemmDataPerReadA,
|
||||
index_t GemmDataPerReadB,
|
||||
class InBlockReorderSrcSubLengths_NCHW,
|
||||
class InBlockReorderSrcClusterLengths_NCHW,
|
||||
class InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
|
||||
index_t InBlockReorderDataPerRead_W,
|
||||
index_t InBlockReorderDataPerWrite_N,
|
||||
class WeiBlockCopyClusterLengths_CK, // not used
|
||||
index_t WeiBlockCopyDataPerRead_K,
|
||||
index_t OutThreadCopyDataPerWrite_N>
|
||||
struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
|
||||
{
|
||||
__device__ void Run(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global) const
|
||||
{
|
||||
// be careful of this assertion
|
||||
static_assert(
|
||||
NPerBlock % NPerThread == 0 &&
|
||||
((GemmNPerThreadSubC <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0) ||
|
||||
(GemmNPerThreadSubC >= NPerBlock && NPerThread == NPerBlock &&
|
||||
GemmNPerThreadSubC % NPerThread == 0)),
|
||||
"wrong!");
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{};
|
||||
constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
|
||||
constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{};
|
||||
|
||||
constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1);
|
||||
|
||||
constexpr index_t K = out_k_h_w_n_global_desc.GetLength(I0);
|
||||
constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1);
|
||||
constexpr index_t Wo = out_k_h_w_n_global_desc.GetLength(I2);
|
||||
constexpr index_t N = out_k_h_w_n_global_desc.GetLength(I3);
|
||||
|
||||
constexpr index_t Y = wei_c_y_x_k_global_desc.GetLength(I1);
|
||||
constexpr index_t X = wei_c_y_x_k_global_desc.GetLength(I2);
|
||||
|
||||
// divide block work: [K, Ho, Wo, N]
|
||||
static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 &&
|
||||
Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
|
||||
"wrong! cannot evenly divide work for workgroup ");
|
||||
|
||||
constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock;
|
||||
constexpr index_t HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock;
|
||||
constexpr index_t WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock;
|
||||
constexpr index_t NBlockWork = (N + NPerBlock - 1) / NPerBlock;
|
||||
|
||||
const index_t k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork);
|
||||
index_t itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork);
|
||||
const index_t h_block_work_id = itmp / (WBlockWork * NBlockWork);
|
||||
itmp -= h_block_work_id * (WBlockWork * NBlockWork);
|
||||
const index_t w_block_work_id = itmp / NBlockWork;
|
||||
const index_t n_block_work_id = itmp - w_block_work_id * NBlockWork;
|
||||
|
||||
const index_t k_block_data_begin = k_block_work_id * KPerBlock;
|
||||
const index_t ho_block_data_begin = h_block_work_id * HoPerBlock;
|
||||
const index_t wo_block_data_begin = w_block_work_id * WoPerBlock;
|
||||
const index_t n_block_data_begin = n_block_work_id * NPerBlock;
|
||||
|
||||
const index_t hi_block_data_begin = ho_block_data_begin;
|
||||
const index_t wi_block_data_begin = wo_block_data_begin;
|
||||
|
||||
// global tensor view
|
||||
constexpr auto wei_c_k_global_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
|
||||
|
||||
// LDS tensor view
|
||||
// be careful of alignment
|
||||
constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
|
||||
WeiBlockCopyDataPerRead_K,
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB);
|
||||
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
|
||||
Number<InBlockReorderDataPerWrite_N>{});
|
||||
|
||||
// this check is ad-hoc
|
||||
// TODO: need to properly implement tensor descriptor with alignment
|
||||
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
|
||||
"GemmDataPerReadB alignment requirement is not meet");
|
||||
|
||||
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, KPerBlock>{},
|
||||
Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
|
||||
|
||||
// tensor view of threadwise output in register
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
|
||||
|
||||
// blockwise copy
|
||||
// input: format is [N, C, Hi, Wi] to [C, Hi, Wi, N]
|
||||
constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};
|
||||
|
||||
const auto blockwise_in_copy_reorder = BlockwiseTensorSliceReorderCopy_v3<
|
||||
BlockSize,
|
||||
Float,
|
||||
decltype(in_n_c_h_w_global_desc),
|
||||
decltype(in_c_h_w_n_block_desc),
|
||||
Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
|
||||
InBlockReorderSrcSubLengths_NCHW,
|
||||
InBlockReorderSrcClusterLengths_NCHW,
|
||||
decltype(map_chwn2nchw),
|
||||
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
|
||||
InBlockReorderDataPerRead_W,
|
||||
InBlockReorderDataPerWrite_N>{};
|
||||
|
||||
// blockwise wei copy
|
||||
// format is [CPerBlock, KPerBlock]
|
||||
const auto blockwise_wei_copy =
|
||||
Blockwise2dTensorCopy3<BlockSize,
|
||||
Float,
|
||||
decltype(wei_c_k_global_desc),
|
||||
decltype(wei_c_k_block_desc),
|
||||
decltype(wei_c_k_block_desc.GetLengths()),
|
||||
WeiBlockCopyDataPerRead_K>{};
|
||||
|
||||
// a series of blockwise batched GEMM
|
||||
// C_matrix += transpose(A_matrix) * B_matrix
|
||||
// A_matrix and B_matrix saved in LDS, C_matrix saved in register
|
||||
// A_matrix[C,K] is a sub-matrix of wei_block[C,K]
|
||||
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
|
||||
// C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
|
||||
constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_k_block_desc.GetStride(I0)>{});
|
||||
|
||||
constexpr auto b_c_wn_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
|
||||
Number<WoPerBlock * NPerBlock>{},
|
||||
Number<in_c_h_w_n_block_desc.GetStride(I0)>{});
|
||||
|
||||
constexpr auto c_k_wn_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{},
|
||||
Number<WoPerThread * NPerThread>{},
|
||||
Number<out_k_h_w_n_thread_desc.GetStride(I0)>{});
|
||||
|
||||
const auto blockwise_batch_gemm =
|
||||
BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2<
|
||||
BlockSize,
|
||||
decltype(a_c_k_block_mtx_desc),
|
||||
decltype(b_c_wn_block_mtx_desc),
|
||||
decltype(c_k_wn_thread_mtx_desc),
|
||||
0,
|
||||
in_c_h_w_n_block_desc.GetStride(I1),
|
||||
out_k_h_w_n_thread_desc.GetStride(I1),
|
||||
HoPerBlock,
|
||||
GemmMPerThreadSubC,
|
||||
GemmNPerThreadSubC,
|
||||
GemmMLevel0Cluster,
|
||||
GemmNLevel0Cluster,
|
||||
GemmMLevel1Cluster,
|
||||
GemmNLevel1Cluster,
|
||||
GemmKPerThreadLoop,
|
||||
HoPerThread,
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB>{};
|
||||
|
||||
// choose GEMM implementation here
|
||||
const auto run_blockwise_batch_gemm = [&](auto... Xs) {
|
||||
#if 0
|
||||
return blockwise_batch_gemm.Run(Xs...);
|
||||
#elif 0
|
||||
return blockwise_batch_gemm.Run_asm(Xs...);
|
||||
#else
|
||||
return blockwise_batch_gemm.Run_asm_v2(Xs...);
|
||||
#endif
|
||||
};
|
||||
|
||||
// LDS: be careful of alignment
|
||||
constexpr index_t in_block_space =
|
||||
in_c_h_w_n_block_desc.GetElementSpace(Number<max_align>{});
|
||||
constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number<max_align>{});
|
||||
|
||||
__shared__ Float p_in_block[in_block_space];
|
||||
__shared__ Float p_wei_block[wei_block_space];
|
||||
|
||||
// register
|
||||
// C++ lambda doesn't capture array, use pointer instead
|
||||
Float p_out_thread_data[out_k_h_w_n_thread_desc.GetElementSpace()];
|
||||
Float* const p_out_thread = p_out_thread_data;
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
{
|
||||
print_ConstantTensorDescriptor(in_c_h_w_n_global_desc, "in_c_h_w_n_global_desc");
|
||||
print_ConstantTensorDescriptor(wei_c_y_x_k_global_desc, "wei_c_y_x_k_global_desc");
|
||||
|
||||
print_ConstantTensorDescriptor(in_c_h_w_n_block_desc, "in_c_h_w_n_block_desc");
|
||||
print_ConstantTensorDescriptor(wei_c_k_block_desc, "wei_c_k_block_desc");
|
||||
|
||||
printf("in_block_space %u, wei_block_space %u\n", in_block_space, wei_block_space);
|
||||
}
|
||||
#endif
|
||||
|
||||
// set threadwise output tensor to 0
|
||||
threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);
|
||||
|
||||
#if 1
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
|
||||
p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
|
||||
{
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 1
|
||||
blockwise_in_copy_reorder.Run(
|
||||
p_in_global_block_offset +
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x),
|
||||
p_in_block);
|
||||
|
||||
blockwise_wei_copy.Run(
|
||||
p_wei_global_block_offset +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_wei_block);
|
||||
#else
|
||||
Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()];
|
||||
Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
blockwise_in_copy_reorder.RunLoadRegisterClipboard(
|
||||
p_in_global_block_offset +
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x),
|
||||
p_in_clipboard);
|
||||
|
||||
blockwise_wei_copy.RunLoadRegisterClipboard(
|
||||
p_wei_global_block_offset +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_wei_clipboard);
|
||||
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block);
|
||||
|
||||
blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_clipboard, p_in_block);
|
||||
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_offset +=
|
||||
CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
|
||||
p_wei_global_block_offset +=
|
||||
CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
|
||||
{
|
||||
#if 0
|
||||
blockwise_in_copy_reorder.Run(p_in_global_block_offset,
|
||||
p_in_block);
|
||||
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset,
|
||||
p_wei_block);
|
||||
#else
|
||||
Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()];
|
||||
Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset,
|
||||
p_in_clipboard);
|
||||
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
|
||||
p_wei_clipboard);
|
||||
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block);
|
||||
blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_clipboard, p_in_block);
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// output: register to global mem,
|
||||
const auto c_thread_mtx_begin =
|
||||
blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
|
||||
|
||||
const index_t k_thread_data_begin = c_thread_mtx_begin.row;
|
||||
const index_t ho_thread_data_begin = c_thread_mtx_begin.batch;
|
||||
const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock;
|
||||
const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock;
|
||||
|
||||
static_if<GemmNPerThreadSubC <= NPerBlock>{}([&](auto f_dummy) { // f_dummy do nothing but
|
||||
// perfect forwarding.
|
||||
// Using this trick to
|
||||
// make this lambda a generic lambda, so it won't be compiled until
|
||||
// instantiated
|
||||
static_assert(
|
||||
(f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0),
|
||||
"wrong!");
|
||||

            // output is a 10d tensor
            constexpr index_t N2 = GemmNPerThreadSubC;
            constexpr index_t N1 = NPerBlock / N2;

            constexpr index_t W2 =
                (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC);
            constexpr index_t W1 = WoPerBlock / W2;

            constexpr index_t K2 = GemmMPerThreadSubC;
            constexpr index_t K1 = KPerBlock / KPerThread;

            constexpr auto out_10d_global_desc =
                make_ConstantTensorDescriptor(Sequence<K / (K1 * K2),
                                                       K1,
                                                       K2,
                                                       Ho,
                                                       Wo / (W1 * W2),
                                                       W1,
                                                       W2,
                                                       N / f_dummy(N1 * N2),
                                                       N1,
                                                       N2>{});

            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
                Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, 1, 1, N2>{});

#if 0
            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
            {
                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
                                               "out_k_h_w_n_thread_desc");
                print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc");

                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
                                               "out_k_h_w_n_global_desc");
                print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
            }
#endif

            threadwise_tensor_slice_copy(out_10d_thread_desc,
                                         p_out_thread,
                                         out_10d_global_desc,
                                         p_out_global +
                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
                                                 k_block_data_begin + k_thread_data_begin,
                                                 ho_block_data_begin + ho_thread_data_begin,
                                                 wo_block_data_begin + wo_thread_data_begin,
                                                 n_block_data_begin + n_thread_data_begin),
                                         out_10d_thread_desc.GetLengths(),
                                         Number<OutThreadCopyDataPerWrite_N>{});
        }).Else([&](auto f_dummy) {
            static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                              GemmNPerThreadSubC % NPerThread == 0,
                          "wrong!");

            // output is a 10d tensor
            constexpr index_t N1 = NPerBlock;

            constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock;
            constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster;
            constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3);

            constexpr index_t K2 = GemmMPerThreadSubC;
            constexpr index_t K1 = KPerBlock / KPerThread;

            constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor(
                Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});

            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
                Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});

#if 0
            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
            {
                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
                                               "out_k_h_w_n_thread_desc");
                print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc");

                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
                                               "out_k_h_w_n_global_desc");
                print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");

                for(index_t i = 0; i < 64; ++i)
                {
                    printf("out %f, ", p_out_thread[i]);
                }
            }
#endif

            threadwise_tensor_slice_copy(out_10d_thread_desc,
                                         p_out_thread,
                                         out_10d_global_desc,
                                         p_out_global +
                                             out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
                                                 k_block_data_begin + k_thread_data_begin,
                                                 ho_block_data_begin + ho_thread_data_begin,
                                                 wo_block_data_begin + wo_thread_data_begin,
                                                 n_block_data_begin + n_thread_data_begin),
                                         out_10d_thread_desc.GetLengths(),
                                         Number<OutThreadCopyDataPerWrite_N>{});
        });
    }
};
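A note on the static_if<...>{}([&](auto f_dummy){ ... }).Else(...) pattern above: because each lambda takes an auto parameter it is a generic lambda, so its body is only instantiated for the branch that is actually taken, and wrapping compile-time constants as f_dummy(...) keeps expressions dependent so the dead branch's static_asserts and descriptor math never fire. A minimal sketch of the dispatch idea (not ck's actual static_if implementation):

template <bool Predicate>
struct static_if_sketch;

template <>
struct static_if_sketch<true>
{
    template <class F>
    static_if_sketch operator()(F f) const
    {
        f([](auto x) { return x; }); // then-branch runs; f_dummy is an identity functor
        return {};
    }

    template <class F>
    void Else(F) const {} // else-branch is never called, hence never instantiated
};

template <>
struct static_if_sketch<false>
{
    template <class F>
    static_if_sketch operator()(F) const { return {}; } // then-branch skipped

    template <class F>
    void Else(F f) const { f([](auto x) { return x; }); }
};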
@@ -1,13 +1,17 @@
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW

#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_tensor_slice_copy.hpp"
#include "threadwise_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_op.hpp"
#include "blockwise_batched_gemm.hpp"

namespace ck {

template <index_t GridSize,
          index_t BlockSize,
          class Float,
@@ -78,10 +82,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                          Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
                      "wrong! cannot evenly divide work for workgroup ");

        constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
        constexpr index_t KBlockWork = mod_conv::integer_divide_ceil(K, KPerBlock);
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
        constexpr index_t NBlockWork = math::integer_divide_ceil(N, NPerBlock);
        constexpr index_t KBlockWork = math::integer_divide_ceil(K, KPerBlock);
        constexpr index_t HBlockWork = math::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = math::integer_divide_ceil(Wo, WoPerBlock);

        constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
@@ -103,10 +107,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw

        // LDS tensor view
        // be careful of alignment
        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                    WeiBlockCopyDataPerRead_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
        constexpr index_t max_align = math::lcm(InBlockReorderDataPerWrite_N,
                                                WeiBlockCopyDataPerRead_K,
                                                GemmDataPerReadA,
                                                GemmDataPerReadB);

        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
@@ -119,7 +123,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw

        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
            Number<math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
@@ -230,7 +234,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
#endif

        // set threadwise output tensor to 0
        threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);
        threadwise_generic_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);

#if 0
        const Float* p_in_global_block_offset =
@@ -436,8 +440,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                wo_block_data_begin + wo_thread_data_begin),
            make_zero_array<index_t, 10>(),
            out_10d_thread_desc.GetLengths().ReorderGivenNew2Old(map_out_global2thread),
            arithmetic_sequence_gen<0, 10, 1>::SeqType{});
            arithmetic_sequence_gen<0, 10, 1>::SeqType{},
            Number<1>{});
#endif
        });
    }
};

} // namespace ck
#endif
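The mod_conv to math rename above touches integer_divide_ceil, which is what sizes the block grid: each workgroup owns one (n, k, ho, wo) tile, so the grid size is the product of the four ceil-divisions. A host-side sketch with made-up sizes, assuming integer_divide_ceil(a, b) computes (a + b - 1) / b as in this commit:

#include <cstdio>

using index_t = unsigned int;

constexpr index_t integer_divide_ceil(index_t a, index_t b) { return (a + b - 1) / b; }

int main()
{
    // example tensor and tile sizes, not taken from the repository
    constexpr index_t N = 128, K = 256, Ho = 28, Wo = 28;
    constexpr index_t NPerBlock = 32, KPerBlock = 64, HoPerBlock = 4, WoPerBlock = 4;

    constexpr index_t NBlockWork = integer_divide_ceil(N, NPerBlock);   // 4
    constexpr index_t KBlockWork = integer_divide_ceil(K, KPerBlock);   // 4
    constexpr index_t HBlockWork = integer_divide_ceil(Ho, HoPerBlock); // 7
    constexpr index_t WBlockWork = integer_divide_ceil(Wo, WoPerBlock); // 7

    // one workgroup per tile: this product must match the launched GridSize
    printf("GridSize = %u\n", NBlockWork * KBlockWork * HBlockWork * WBlockWork); // 784

    return 0;
}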
@@ -1,13 +1,17 @@
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER

#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_tensor_slice_copy.hpp"
#include "threadwise_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_op.hpp"
#include "blockwise_batched_gemm.hpp"

namespace ck {

template <index_t GridSize,
          index_t BlockSize,
          class Float,
@@ -40,7 +44,7 @@ template <index_t GridSize,
          class WeiBlockCopyClusterLengths_CK, // not used
          index_t WeiBlockCopyDataPerRead_K,
          index_t OutThreadCopyDataPerWrite_W>
struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
{
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
@@ -81,10 +85,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
                          Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
                      "wrong! cannot evenly divide work for workgroup ");

        constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
        constexpr index_t KBlockWork = mod_conv::integer_divide_ceil(K, KPerBlock);
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
        constexpr index_t NBlockWork = math::integer_divide_ceil(N, NPerBlock);
        constexpr index_t KBlockWork = math::integer_divide_ceil(K, KPerBlock);
        constexpr index_t HBlockWork = math::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = math::integer_divide_ceil(Wo, WoPerBlock);

        constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
@@ -105,10 +109,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw

        // LDS tensor view
        // be careful of alignment
        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                    WeiBlockCopyDataPerRead_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
        constexpr index_t max_align = math::lcm(InBlockReorderDataPerWrite_N,
                                                WeiBlockCopyDataPerRead_K,
                                                GemmDataPerReadA,
                                                GemmDataPerReadB);

        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
@@ -121,7 +125,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw

        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
            Number<math::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
@@ -233,7 +237,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
#endif

        // set threadwise output tensor to 0
        threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);
        threadwise_generic_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);

        for(index_t y = 0; y < Y; ++y)
        {
@@ -487,8 +491,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
                wo_block_data_begin + wo_thread_data_begin),
            make_zero_array<index_t, 10>(),
            out_10d_thread_desc.GetLengths().ReorderGivenNew2Old(map_out_global2thread),
            arithmetic_sequence_gen<0, 10, 1>::SeqType{});
            arithmetic_sequence_gen<0, 10, 1>::SeqType{},
            Number<1>{});
#endif
        });
    }
};

} // namespace ck
#endif
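The *_lds_double_buffer variants renamed in this commit pipeline the global-to-LDS copy of tile i+1 against the GEMM on tile i, so memory traffic and math overlap. A schematic of that pipeline; copy_tile_to_lds, gemm_on_tile, and TILE_SIZE are placeholders invented for this sketch, not the kernels' real helpers, and the copy assumes blockDim.x == TILE_SIZE for brevity:

using index_t = unsigned int;

constexpr index_t TILE_SIZE = 1024; // elements per LDS tile (example value)

__device__ void copy_tile_to_lds(const float* p_global, float* p_lds, index_t tile)
{
    // each thread copies its share of one tile; real kernels stage through registers
    p_lds[threadIdx.x] = p_global[tile * TILE_SIZE + threadIdx.x];
}

__device__ void gemm_on_tile(const float* p_lds)
{
    (void)p_lds; // the real kernels run a blockwise (batched) GEMM here
}

__device__ void double_buffer_pipeline(const float* p_global, index_t num_tiles)
{
    __shared__ float p_lds[2 * TILE_SIZE]; // two LDS buffers, used ping-pong

    copy_tile_to_lds(p_global, &p_lds[0], 0); // preload tile 0

    for(index_t i = 0; i < num_tiles; ++i)
    {
        __syncthreads(); // tile i is now fully visible in LDS

        if(i + 1 < num_tiles) // prefetch tile i+1 into the other buffer
            copy_tile_to_lds(p_global, &p_lds[((i + 1) % 2) * TILE_SIZE], i + 1);

        gemm_on_tile(&p_lds[(i % 2) * TILE_SIZE]); // compute on tile i
    }
}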
@@ -1,4 +1,6 @@
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN

#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
@@ -6,6 +8,8 @@
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_gemm.hpp"

namespace ck {

// define B = flatten(N, Hi, Wi)
template <index_t GridSize,
          index_t BlockSize,
@@ -181,7 +185,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn

        // LDS: be careful of alignment
        constexpr index_t max_align =
            mod_conv::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
            math::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);

        constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});

@@ -275,3 +279,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
        }
    }
};

} // namespace ck
#endif
@@ -1,12 +1,16 @@
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER

#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_4d_tensor_op.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"

namespace ck {

// define B = flatten(N, Hi, Wi)
template <index_t GridSize,
          index_t BlockSize,
@@ -185,7 +189,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer

        // LDS: be careful of alignment
        constexpr index_t max_align =
            mod_conv::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
            math::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);

        constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});

@@ -404,3 +408,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
        }
    }
};

} // namespace ck
#endif

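max_align above is a least common multiple because the same LDS buffer is touched with several vector widths (the blockwise copies and the GEMM reads), and padding GetElementSpace to their lcm satisfies every access at once. A host-side sketch of the variadic lcm idea, with example widths that are assumptions, not values from the repository:

#include <cstdio>

using index_t = unsigned int;

constexpr index_t gcd(index_t x, index_t y) { return y == 0 ? x : gcd(y, x % y); }
constexpr index_t lcm(index_t x, index_t y) { return (x / gcd(x, y)) * y; }

template <class... Ts>
constexpr index_t lcm(index_t x, index_t y, Ts... zs)
{
    return lcm(lcm(x, y), zs...); // fold pairwise, like ck's variadic math::lcm
}

// example widths: float4 stores into LDS, float4 weight reads, float2 GEMM reads
constexpr index_t max_align = lcm(index_t(4), index_t(4), index_t(2));

static_assert(max_align == 4, "padding to 4 elements satisfies all three accesses");

int main()
{
    printf("max_align = %u\n", max_align);
    return 0;
}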
@@ -1,11 +1,15 @@
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW

#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_op.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"

namespace ck {

// define B = merge(N0, Ho, Wo)
template <index_t GridSize,
          index_t BlockSize,
@@ -146,7 +150,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
        // be careful of LDS alignment
        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::lcm(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});
            Number<math::lcm(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});

        // operator for blockwise copy of weight into LDS
        // slice a tensor, and copy it into another tensor
@@ -218,10 +222,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
        };

        // LDS allocation for input and weight: be careful of alignment
        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2,
                                                    WeiBlockCopyDataPerAccess_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
        constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2,
                                                WeiBlockCopyDataPerAccess_K,
                                                GemmDataPerReadA,
                                                GemmDataPerReadB);

        constexpr index_t in_block_space =
            in_c_n1_b_n2_block_mem_desc.GetElementSpace(Number<max_align>{});
@@ -368,3 +372,6 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
        }
    }
};

} // namespace ck
#endif

@@ -1,11 +1,15 @@
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER

#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_op.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"

namespace ck {

// define B = merge(N0, Ho, Wo)
template <index_t GridSize,
          index_t BlockSize,
@@ -34,7 +38,7 @@ template <index_t GridSize,
          class WeiBlockCopySubLengths_C_K,
          class WeiBlockCopyClusterLengths_C_K,
          index_t WeiBlockCopyDataPerAccess_K>
struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
{
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
@@ -143,7 +147,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
        // be careful of LDS alignment
        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::lcm(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});
            Number<math::lcm(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});

        // operator for blockwise copy of weight into LDS
        // slice a tensor, and copy it into another tensor
@@ -215,10 +219,10 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
        };

        // LDS allocation for input and weight: be careful of alignment
        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2,
                                                    WeiBlockCopyDataPerAccess_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
        constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2,
                                                WeiBlockCopyDataPerAccess_K,
                                                GemmDataPerReadA,
                                                GemmDataPerReadB);

        constexpr index_t in_block_space =
            in_c_n1_b_n2_block_mem_desc.GetElementSpace(Number<max_align>{});
@@ -395,3 +399,6 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
        }
    }
};

} // namespace ck
#endif
@@ -1,11 +1,15 @@
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW

#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_op.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
#include "threadwise_generic_tensor_slice_op.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"

namespace ck {

// define B = merge(N0, Ho, Wo)
template <index_t GridSize,
@@ -176,7 +180,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
        // be careful of LDS alignment
        constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<EPerBlock, KPerBlock>{},
            Number<mod_conv::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
            Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});

        // operator for blockwise copy of weight into LDS
        // slice a tensor, and copy it into another tensor
@@ -248,10 +252,10 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
        };

        // LDS allocation for input and weight: be careful of alignment
        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2,
                                                    WeiBlockCopyDstDataPerWrite_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
        constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2,
                                                WeiBlockCopyDstDataPerWrite_K,
                                                GemmDataPerReadA,
                                                GemmDataPerReadB);

        constexpr index_t in_block_space =
            in_e_n1_b_n2_block_desc.GetElementSpace(Number<max_align>{});
@@ -345,3 +349,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
        }
    }
};

} // namespace ck
#endif

@@ -1,11 +1,15 @@
#pragma once
#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER

#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_op.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
#include "threadwise_generic_tensor_slice_op.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"

namespace ck {

// define B = merge(N0, Ho, Wo)
template <index_t GridSize,
@@ -42,7 +46,7 @@ template <index_t GridSize,
          class WeiBlockCopyDstAccessOrder,
          index_t WeiBlockCopySrcDataPerRead_E,
          index_t WeiBlockCopyDstDataPerWrite_K>
struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
{
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
@@ -165,7 +169,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
        // be careful of LDS alignment
        constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<EPerBlock, KPerBlock>{},
            Number<mod_conv::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
            Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});

        // operator for blockwise copy of weight into LDS
        // slice a tensor, and copy it into another tensor
@@ -237,10 +241,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
        };

        // LDS allocation for input and weight: be careful of alignment
        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2,
                                                    WeiBlockCopyDstDataPerWrite_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);
        constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2,
                                                WeiBlockCopyDstDataPerWrite_K,
                                                GemmDataPerReadA,
                                                GemmDataPerReadB);

        constexpr index_t in_block_space =
            in_e_n1_b_n2_block_desc.GetElementSpace(Number<max_align>{});
@@ -410,3 +414,6 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
        }
    }
};

} // namespace ck
#endif
src/include/gridwise_convolution_kernel_wrapper.hpp (new file, 16 lines)
@@ -0,0 +1,16 @@
#ifndef CK_GRIDWISE_CONVOLUTION_KERNEL_WRAPPER
#define CK_GRIDWISE_CONVOLUTION_KERNEL_WRAPPER

namespace ck {

template <class GridwiseConvolution, class T>
__global__ void run_gridwise_convolution_kernel(const T* const __restrict__ p_in_global,
                                                const T* const __restrict__ p_wei_global,
                                                T* const __restrict__ p_out_global)
{
    GridwiseConvolution{}.Run(p_in_global, p_wei_global, p_out_global);
}

} // namespace ck

#endif
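The wrapper above is the single __global__ entry point: each tuned configuration becomes one template instantiation whose state lives entirely in the GridwiseConvolution type. A sketch of a host-side launch using plain HIP syntax; GridwiseConvDummy stands in for a concrete GridwiseConvolution* type and is not part of the repository:

#include "hip/hip_runtime.h"

struct GridwiseConvDummy
{
    __device__ void Run(const float* p_in, const float* p_wei, float* p_out) const
    {
        // a real gridwise convolution would do blockwise copies plus a GEMM here
        (void)p_in; (void)p_wei; (void)p_out;
    }
};

void launch_sketch(const float* p_in_dev, const float* p_wei_dev, float* p_out_dev,
                   unsigned grid_size, unsigned block_size)
{
    // the wrapper keeps the kernel trivial; all tuning parameters ride in the type
    hipLaunchKernelGGL(
        HIP_KERNEL_NAME(ck::run_gridwise_convolution_kernel<GridwiseConvDummy, float>),
        dim3(grid_size), dim3(block_size), 0, 0,
        p_in_dev, p_wei_dev, p_out_dev);
}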
@@ -1,9 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
template <class GridwiseConvolution, class T>
|
||||
__global__ void run_gridwise_convolution(const T* const __restrict__ p_in_global,
|
||||
const T* const __restrict__ p_wei_global,
|
||||
T* const __restrict__ p_out_global)
|
||||
{
|
||||
GridwiseConvolution{}.Run(p_in_global, p_wei_global, p_out_global);
|
||||
}
|
||||
@@ -7,6 +7,8 @@
#include "threadwise_4d_tensor_op.hpp"
#include "threadwise_direct_convolution.hpp"

namespace ck {

template <class TInWei,
          class TOut,
          class TAccum,
@@ -253,3 +255,5 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
            wo_block_data_begin + wo_thread_data_begin),
        out_nkhw_thread_desc.GetLengths());
}

} // namespace ck

@@ -7,6 +7,8 @@
#include "threadwise_4d_tensor_op.hpp"
#include "blockwise_gemm.hpp"

namespace ck {

template <index_t GridSize,
          index_t BlockSize,
          class Float,
@@ -292,3 +294,5 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
        out_hkwn_thread_desc.GetLengths(),
        reorder_khwn_from_hkwn);
}

} // namespace ck

@@ -1,4 +1,7 @@
#pragma once
#ifndef CK_INTEGRAL_CONSTANT_HPP
#define CK_INTEGRAL_CONSTANT_HPP

namespace ck {

template <class T, T N>
struct integral_constant
@@ -16,3 +19,6 @@ __host__ __device__ constexpr auto operator+(integral_constant<T, X>, integral_c

template <index_t N>
using Number = integral_constant<index_t, N>;

} // namespace ck
#endif

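Number<N> above wraps an index in a distinct type, so descriptor arithmetic happens in the type system at compile time. A host-side sketch of the same integral_constant machinery; the operator+ body is an assumption based on the overload named in the hunk header (adding the wrapped values), not copied from the repository:

#include <cstdio>

using index_t = unsigned int;

template <class T, T N>
struct integral_constant
{
    static constexpr T value = N;
};

// adding two wrapped constants yields a new wrapped constant
template <class T, T X, T Y>
constexpr auto operator+(integral_constant<T, X>, integral_constant<T, Y>)
{
    return integral_constant<T, X + Y>{};
}

template <index_t N>
using Number = integral_constant<index_t, N>;

int main()
{
    constexpr auto sum = Number<3>{} + Number<4>{}; // integral_constant<index_t, 7>
    static_assert(decltype(sum)::value == 7, "computed entirely at compile time");
    printf("%u\n", decltype(sum)::value);
    return 0;
}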
@@ -1,4 +1,6 @@
#pragma once
#ifndef CK_TENSOR_HPP
#define CK_TENSOR_HPP

#include <thread>
#include <vector>
#include <numeric>
@@ -266,3 +268,5 @@ struct Tensor
    TensorDescriptor mDesc;
    std::vector<T> mData;
};

#endif

@@ -1,6 +1,10 @@
#pragma once
#ifndef CK_THREADWISE_4D_TENSOR_OP_HPP
#define CK_THREADWISE_4D_TENSOR_OP_HPP

#include "ConstantTensorDescriptor.hpp"

namespace ck {

template <class Float, class Desc, class IDim, class NShift>
__device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
{
@@ -50,3 +54,6 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
        }
    }
}

} // namespace ck
#endif

@@ -1,6 +1,10 @@
#pragma once
#ifndef CK_THREADWISE_DIRECT_CONVOLUTION_HPP
#define CK_THREADWISE_DIRECT_CONVOLUTION_HPP

#include "ConstantTensorDescriptor.hpp"
#include "threadwise_tensor_slice_op.hpp"
#include "threadwise_tensor_slice_copy.hpp"

namespace ck {

// optimized for the scenario where p_in, p_wei, p_out are in registers
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
@@ -218,3 +222,6 @@ __device__ void threadwise_direct_convolution_3(InDesc,
    }
#endif
}

} // namespace ck
#endif

@@ -1,7 +1,11 @@
#pragma once
#ifndef CK_THREADWISE_GEMM_HPP
#define CK_THREADWISE_GEMM_HPP

#include "common.hpp"
#include "ConstantMatrixDescriptor.hpp"

namespace ck {

template <class Float, class Matrix>
__device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
{
@@ -114,3 +118,6 @@ __device__ void threadwise_gemm(MatrixA,
        assert(false);
    }
}

} // namespace ck
#endif

src/include/threadwise_generic_tensor_op.hpp (new file, 19 lines)
@@ -0,0 +1,19 @@
#ifndef CK_THREADWISE_GENERIC_TENSOR_OP_HPP
#define CK_THREADWISE_GENERIC_TENSOR_OP_HPP

#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"

namespace ck {
template <class Float, class TDesc>
__device__ void threadwise_generic_tensor_set_zero(TDesc, Float* __restrict__ p)
{
    static_ford<decltype(TDesc::GetLengths())>{}([&](auto multi_id) {
        constexpr index_t offset = TDesc::GetOffsetFromMultiIndex(multi_id);

        p[offset] = static_cast<Float>(0);
    });
}

} // namespace ck
#endif
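threadwise_generic_tensor_set_zero above visits every multi-index of the descriptor with static_ford, a compile-time nested loop over TDesc's lengths. A simplified 2d stand-in for that idiom (ck's static_ford is fully unrolled via template recursion over a Sequence; a plain loop here just shows the visiting order):

#include <cstdio>

template <int L0, int L1, class F>
void ford_2d_sketch(F f)
{
    // visit every (i0, i1) multi-index in row-major order over the lengths
    for(int i0 = 0; i0 < L0; ++i0)
        for(int i1 = 0; i1 < L1; ++i1)
            f(i0, i1);
}

int main()
{
    float p[2 * 3];

    // zero a packed 2x3 tensor, one element per visited multi-index
    ford_2d_sketch<2, 3>([&](int i0, int i1) { p[i0 * 3 + i1] = 0.0f; });

    for(float v : p)
        printf("%.1f ", v); // prints six zeros
    printf("\n");
    return 0;
}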
@@ -1,7 +1,11 @@
#pragma once
#ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP

#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"

namespace ck {

template <class Float,
          class SrcDesc,
          class DstDesc,
@@ -97,3 +101,6 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
    });
#endif
}

} // namespace ck
#endif
@@ -1,6 +1,10 @@
#pragma once
#ifndef CK_THREADWISE_TENSOR_SLICE_COPY_HPP
#define CK_THREADWISE_TENSOR_SLICE_COPY_HPP

#include "ConstantTensorDescriptor.hpp"

namespace ck {

// need to assume src and dst are aligned
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
__device__ void threadwise_tensor_slice_copy(SrcDesc,
@@ -192,3 +196,6 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
        });
    });
}

} // namespace ck
#endif
@@ -1,4 +1,7 @@
#pragma once
#ifndef CK_BASE_HPP
#define CK_BASE_HPP

namespace ck {

__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }

@@ -22,7 +25,7 @@ __host__ __device__ constexpr bool is_same_type(X, Y)
    return is_same<X, Y>::value;
}

namespace mod_conv { // namespace mod_conv
namespace math { // namespace math
template <class T, T s>
struct scales
{
@@ -110,4 +113,7 @@ __host__ __device__ constexpr T lcm(T x, Ts... xs)
    return max(x, xs...);
}

} // namespace mod_conv
} // namespace math
} // namespace ck

#endif
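The mod_conv namespace renamed to math above carries small constexpr utilities such as scales<T, s>, a multiply-by-a-compile-time-constant functor. A sketch of how such a functor is typically shaped and used; the operator() body here is an assumption for illustration, not copied from the repository:

#include <cstdio>

template <class T, T s>
struct scales
{
    // multiply the argument by the compile-time factor s
    constexpr T operator()(T a) const { return s * a; }
};

int main()
{
    constexpr scales<int, 3> triple{};
    static_assert(triple(14) == 42, "evaluated at compile time");
    printf("%d\n", triple(14));
    return 0;
}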
@@ -1,7 +1,11 @@
#pragma once
#include "config.h"
#ifndef CK_VECTOR_TYPE_HPP
#define CK_VECTOR_TYPE_HPP

#include "config.hpp"
#include "integral_constant.hpp"

namespace ck {

template <class T, index_t N>
struct vector_type
{
@@ -184,3 +188,7 @@ struct vector_type<char4, 2>
    using MemoryType = int64_t;
};
#endif

} // namespace ck

#endif