mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
Merge branch 'direct_fp16'
This commit is contained in:
@@ -1,16 +1,16 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_direct_convolution_2.hip.hpp"
|
||||
#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp"
|
||||
|
||||
template <class T, class InDesc, class WeiDesc, class OutDesc>
|
||||
void device_direct_convolution_2(InDesc,
|
||||
const Tensor<T>& in,
|
||||
WeiDesc,
|
||||
const Tensor<T>& wei,
|
||||
OutDesc,
|
||||
Tensor<T>& out,
|
||||
unsigned nrepeat)
|
||||
void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc,
|
||||
const Tensor<T>& in,
|
||||
WeiDesc,
|
||||
const Tensor<T>& wei,
|
||||
OutDesc,
|
||||
Tensor<T>& out,
|
||||
unsigned nrepeat)
|
||||
{
|
||||
std::size_t data_sz = sizeof(T);
|
||||
DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
|
||||
@@ -46,6 +46,27 @@ void device_direct_convolution_2(InDesc,
|
||||
constexpr unsigned HoPerThread = 2;
|
||||
constexpr unsigned WoPerThread = 2;
|
||||
|
||||
constexpr unsigned InBlockCopyDataPerRead = 2;
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 1
|
||||
// 3x3, 34x34, 128 thread, fp16
|
||||
constexpr unsigned NPerBlock = 2;
|
||||
constexpr unsigned KPerBlock = 32;
|
||||
constexpr unsigned CPerBlock = 4;
|
||||
constexpr unsigned HoPerBlock = 2;
|
||||
constexpr unsigned WoPerBlock = 32;
|
||||
|
||||
constexpr unsigned NPerThread = 2;
|
||||
constexpr unsigned KPerThread = 4;
|
||||
constexpr unsigned CPerThread = 2;
|
||||
constexpr unsigned HoPerThread = 2;
|
||||
constexpr unsigned WoPerThread = 2;
|
||||
|
||||
constexpr unsigned InBlockCopyDataPerRead = 2;
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#endif
|
||||
|
||||
@@ -57,27 +78,30 @@ void device_direct_convolution_2(InDesc,
|
||||
|
||||
for(unsigned i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
float time = launch_kernel(gridwise_direct_convolution_2<T,
|
||||
InDesc,
|
||||
WeiDesc,
|
||||
OutDesc,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
NPerThread,
|
||||
KPerThread,
|
||||
CPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
BlockSize,
|
||||
GridSize>,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
|
||||
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
|
||||
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
|
||||
float time =
|
||||
launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw<T,
|
||||
InDesc,
|
||||
WeiDesc,
|
||||
OutDesc,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
NPerThread,
|
||||
KPerThread,
|
||||
CPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
InBlockCopyDataPerRead,
|
||||
WeiBlockCopyDataPerRead,
|
||||
BlockSize,
|
||||
GridSize>,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
|
||||
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
|
||||
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
|
||||
|
||||
printf("Elapsed time : %f ms\n", time);
|
||||
usleep(std::min(time * 1000, float(10000)));
|
||||
211
driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
Normal file
211
driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
Normal file
@@ -0,0 +1,211 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp"
|
||||
|
||||
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
|
||||
void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
|
||||
const Tensor<TInWei>& in_nchw,
|
||||
WeiDesc,
|
||||
const Tensor<TInWei>& wei_kcyx,
|
||||
OutDesc,
|
||||
Tensor<TOut>& out_nkhw,
|
||||
unsigned nrepeat)
|
||||
{
|
||||
// this suppose in / wei data type is int8x4
|
||||
constexpr unsigned NVector = 4;
|
||||
using accum_t = int32_t;
|
||||
using vector_t = vector_type<TInWei, NVector>;
|
||||
using vector_mem_t = typename vector_t::MemoryType;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto in_nchw_desc = InDesc{};
|
||||
constexpr auto wei_kcyx_desc = WeiDesc{};
|
||||
constexpr auto out_nkhw_desc = OutDesc{};
|
||||
|
||||
constexpr unsigned Hi = in_nchw_desc.GetLength(I2);
|
||||
constexpr unsigned Wi = in_nchw_desc.GetLength(I3);
|
||||
|
||||
constexpr unsigned N = out_nkhw_desc.GetLength(I0);
|
||||
constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
|
||||
constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);
|
||||
|
||||
constexpr unsigned K = wei_kcyx_desc.GetLength(I0);
|
||||
constexpr unsigned C = wei_kcyx_desc.GetLength(I1);
|
||||
constexpr unsigned Y = wei_kcyx_desc.GetLength(I2);
|
||||
constexpr unsigned X = wei_kcyx_desc.GetLength(I3);
|
||||
|
||||
// vectorized input
|
||||
auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
|
||||
ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");
|
||||
|
||||
Tensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));
|
||||
|
||||
auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
|
||||
#if 0
|
||||
in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w);
|
||||
#elif 0
|
||||
in_nchw_vec(n, c, h, w) =
|
||||
vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
|
||||
#elif 1
|
||||
in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
|
||||
in_nchw(n, 4 * c + 1, h, w),
|
||||
in_nchw(n, 4 * c + 2, h, w),
|
||||
in_nchw(n, 4 * c + 3, h, w));
|
||||
#endif
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)(
|
||||
std::thread::hardware_concurrency());
|
||||
|
||||
// vectorize weight
|
||||
auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
|
||||
ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");
|
||||
|
||||
Tensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));
|
||||
|
||||
auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
|
||||
#if 0
|
||||
wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x);
|
||||
#elif 0
|
||||
wei_kcyx_vec(k, c, y, x) =
|
||||
vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x));
|
||||
#elif 1
|
||||
wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 4 * c, y, x),
|
||||
wei_kcyx(k, 4 * c + 1, y, x),
|
||||
wei_kcyx(k, 4 * c + 2, y, x),
|
||||
wei_kcyx(k, 4 * c + 3, y, x));
|
||||
#endif
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)(
|
||||
std::thread::hardware_concurrency());
|
||||
|
||||
//
|
||||
DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace());
|
||||
DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace());
|
||||
DeviceMem out_nkhw_device_buf(sizeof(TOut) * out_nkhw.mDesc.GetElementSpace());
|
||||
|
||||
in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data());
|
||||
wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data());
|
||||
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
|
||||
|
||||
#if 0
|
||||
// 3x3, 34x34, 128 thread, fp32, vector = 1
|
||||
constexpr unsigned NPerBlock = 2;
|
||||
constexpr unsigned KPerBlock = 32;
|
||||
constexpr unsigned CPerBlock = 4;
|
||||
constexpr unsigned HoPerBlock = 2;
|
||||
constexpr unsigned WoPerBlock = 32;
|
||||
|
||||
constexpr unsigned NPerThread = 2;
|
||||
constexpr unsigned KPerThread = 4;
|
||||
constexpr unsigned CPerThread = 2;
|
||||
constexpr unsigned HoPerThread = 2;
|
||||
constexpr unsigned WoPerThread = 2;
|
||||
|
||||
constexpr unsigned InBlockCopyDataPerRead = 2;
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 0
|
||||
// 3x3, 34x34, 128 thread, fp32, vector = 2
|
||||
constexpr unsigned NPerBlock = 2;
|
||||
constexpr unsigned KPerBlock = 32;
|
||||
constexpr unsigned CPerBlock = 2;
|
||||
constexpr unsigned HoPerBlock = 2;
|
||||
constexpr unsigned WoPerBlock = 32;
|
||||
|
||||
constexpr unsigned NPerThread = 2;
|
||||
constexpr unsigned KPerThread = 4;
|
||||
constexpr unsigned CPerThread = 1;
|
||||
constexpr unsigned HoPerThread = 2;
|
||||
constexpr unsigned WoPerThread = 2;
|
||||
|
||||
constexpr unsigned InBlockCopyDataPerRead = 2;
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 0
|
||||
// 3x3, 34x34, 128 thread, int8, vector = 4
|
||||
constexpr unsigned NPerBlock = 2;
|
||||
constexpr unsigned KPerBlock = 32;
|
||||
constexpr unsigned CPerBlock = 8;
|
||||
constexpr unsigned HoPerBlock = 4;
|
||||
constexpr unsigned WoPerBlock = 32;
|
||||
|
||||
constexpr unsigned NPerThread = 1;
|
||||
constexpr unsigned KPerThread = 8;
|
||||
constexpr unsigned CPerThread = 2;
|
||||
constexpr unsigned HoPerThread = 4;
|
||||
constexpr unsigned WoPerThread = 2;
|
||||
|
||||
constexpr unsigned InBlockCopyDataPerRead = 2;
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 1
|
||||
// 1x1, 32x32, 128 thread, int8, vector = 4
|
||||
constexpr unsigned NPerBlock = 1;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
constexpr unsigned CPerBlock = 16;
|
||||
constexpr unsigned HoPerBlock = 4;
|
||||
constexpr unsigned WoPerBlock = 32;
|
||||
|
||||
constexpr unsigned NPerThread = 1;
|
||||
constexpr unsigned KPerThread = 8;
|
||||
constexpr unsigned CPerThread = 2;
|
||||
constexpr unsigned HoPerThread = 4;
|
||||
constexpr unsigned WoPerThread = 2;
|
||||
|
||||
constexpr unsigned InBlockCopyDataPerRead = 2;
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#endif
|
||||
|
||||
constexpr unsigned GridSize =
|
||||
(N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);
|
||||
|
||||
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
|
||||
|
||||
for(unsigned i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
float time = launch_kernel(
|
||||
gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw<TInWei,
|
||||
TOut,
|
||||
accum_t,
|
||||
decltype(in_nchw_vec_desc),
|
||||
decltype(wei_kcyx_vec_desc),
|
||||
decltype(out_nkhw_desc),
|
||||
NVector,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
NPerThread,
|
||||
KPerThread,
|
||||
CPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
InBlockCopyDataPerRead,
|
||||
WeiBlockCopyDataPerRead,
|
||||
BlockSize,
|
||||
GridSize>,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
static_cast<TInWei*>(in_nchw_vec_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TInWei*>(wei_kcyx_vec_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TInWei*>(out_nkhw_device_buf.GetDeviceBuffer()));
|
||||
|
||||
printf("Elapsed time : %f ms\n", time);
|
||||
usleep(std::min(time * 1000, float(10000)));
|
||||
}
|
||||
|
||||
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
|
||||
}
|
||||
@@ -74,7 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
|
||||
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
|
||||
out_khwn_device_buf.ToDevice(out_khwn.mData.data());
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
// for 3x3, 34x34
|
||||
constexpr unsigned NPerBlock = 16;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
@@ -213,7 +213,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// for 1x1, 28x28
|
||||
constexpr unsigned NPerBlock = 16;
|
||||
constexpr unsigned KPerBlock = 128;
|
||||
@@ -245,6 +245,39 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
|
||||
|
||||
constexpr unsigned OutThreadCopyDataPerWrite = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 1
|
||||
// for 1x1, 14x14
|
||||
constexpr unsigned NPerBlock = 16;
|
||||
constexpr unsigned KPerBlock = 128;
|
||||
constexpr unsigned CPerBlock = 8;
|
||||
constexpr unsigned HoPerBlock = 2;
|
||||
constexpr unsigned WoPerBlock = 2;
|
||||
|
||||
constexpr unsigned NPerThread = 4;
|
||||
constexpr unsigned KPerThread = 16;
|
||||
constexpr unsigned CPerThread = 1;
|
||||
constexpr unsigned HoPerThread = 1;
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
constexpr unsigned InBlockCopy_ThreadPerDimC = 8;
|
||||
constexpr unsigned InBlockCopy_ThreadPerDimH = 2;
|
||||
constexpr unsigned InBlockCopy_ThreadPerDimW = 2;
|
||||
constexpr unsigned InBlockCopy_ThreadPerDimN = 4;
|
||||
constexpr unsigned InBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr unsigned GemmMPerThreadSubC = 4;
|
||||
constexpr unsigned GemmNPerThreadSubC = 4;
|
||||
constexpr unsigned GemmMLevel0Cluster = 4;
|
||||
constexpr unsigned GemmNLevel0Cluster = 2;
|
||||
constexpr unsigned GemmMLevel1Cluster = 2;
|
||||
constexpr unsigned GemmNLevel1Cluster = 4;
|
||||
constexpr unsigned GemmKPerThreadLoop = 1;
|
||||
|
||||
constexpr unsigned OutThreadCopyDataPerWrite = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp"
|
||||
#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp"
|
||||
|
||||
template <class T, class InDesc, class WeiDesc, class OutDesc>
|
||||
@@ -209,39 +210,43 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
|
||||
for(unsigned i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
float time =
|
||||
launch_kernel(gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer<
|
||||
GridSize,
|
||||
BlockSize,
|
||||
T,
|
||||
decltype(in_chwn_desc),
|
||||
decltype(wei_cyxk_desc),
|
||||
decltype(out_khwn_desc),
|
||||
BPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
BPerThread,
|
||||
KPerThread,
|
||||
GemmThreadPerColumnPerCluster,
|
||||
GemmThreadPerRowPerCluster,
|
||||
GemmMPerThreadSubC,
|
||||
GemmNPerThreadSubC,
|
||||
GemmMLevel0Cluster,
|
||||
GemmNLevel0Cluster,
|
||||
GemmMLevel1Cluster,
|
||||
GemmNLevel1Cluster,
|
||||
GemmKPerThreadLoop,
|
||||
InBlockCopyThreadPerDim0,
|
||||
InBlockCopyThreadPerDim1,
|
||||
WeiBlockCopyThreadPerDim0,
|
||||
WeiBlockCopyThreadPerDim1,
|
||||
InBlockCopyDataPerRead,
|
||||
WeiBlockCopyDataPerRead>,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
|
||||
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
|
||||
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
|
||||
float time = launch_kernel(
|
||||
#if 0
|
||||
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
|
||||
#else
|
||||
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer
|
||||
#endif
|
||||
<GridSize,
|
||||
BlockSize,
|
||||
T,
|
||||
decltype(in_chwn_desc),
|
||||
decltype(wei_cyxk_desc),
|
||||
decltype(out_khwn_desc),
|
||||
BPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
BPerThread,
|
||||
KPerThread,
|
||||
GemmThreadPerColumnPerCluster,
|
||||
GemmThreadPerRowPerCluster,
|
||||
GemmMPerThreadSubC,
|
||||
GemmNPerThreadSubC,
|
||||
GemmMLevel0Cluster,
|
||||
GemmNLevel0Cluster,
|
||||
GemmMLevel1Cluster,
|
||||
GemmNLevel1Cluster,
|
||||
GemmKPerThreadLoop,
|
||||
InBlockCopyThreadPerDim0,
|
||||
InBlockCopyThreadPerDim1,
|
||||
WeiBlockCopyThreadPerDim0,
|
||||
WeiBlockCopyThreadPerDim1,
|
||||
InBlockCopyDataPerRead,
|
||||
WeiBlockCopyDataPerRead>,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
|
||||
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
|
||||
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
|
||||
|
||||
printf("Elapsed time : %f ms\n", time);
|
||||
usleep(std::min(time * 1000, float(10000)));
|
||||
|
||||
@@ -7,10 +7,11 @@
|
||||
#include "tensor.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "conv_common.hip.hpp"
|
||||
#include "device_direct_convolution_1.hpp"
|
||||
#include "device_direct_convolution_2.hpp"
|
||||
//#include "device_direct_convolution_1.hpp"
|
||||
#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp"
|
||||
//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp"
|
||||
#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
|
||||
//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
|
||||
#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp"
|
||||
|
||||
struct GeneratorTensor_1
|
||||
@@ -34,25 +35,6 @@ struct GeneratorTensor_2
|
||||
}
|
||||
};
|
||||
|
||||
struct GeneratorTensor_3
|
||||
{
|
||||
template <class... Is>
|
||||
double operator()(Is... is)
|
||||
{
|
||||
#if 0
|
||||
std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
|
||||
return std::accumulate(ls.begin(), ls.end(), std::size_t(0));
|
||||
#elif 1
|
||||
assert(sizeof...(Is) > 0);
|
||||
std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};
|
||||
std::vector<std::size_t> lens(sizeof...(Is), 100);
|
||||
std::vector<std::size_t> strides(sizeof...(Is), 1);
|
||||
std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is)-1), strides.rbegin() + 1);
|
||||
return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
struct GeneratorTensor_Checkboard
|
||||
{
|
||||
template <class... Ts>
|
||||
@@ -106,9 +88,12 @@ auto make_TensorDescriptor(TConstTensorDesc)
|
||||
return TensorDescriptor(lengths, strides);
|
||||
}
|
||||
|
||||
template <class T, class LowerPads, class UpperPads>
|
||||
void host_direct_convolution(
|
||||
const Tensor<T>& in_nchw, const Tensor<T>& wei_kcyx, Tensor<T>& out, LowerPads, UpperPads)
|
||||
template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
|
||||
void host_direct_convolution(const Tensor<TIn>& in_nchw,
|
||||
const Tensor<TWei>& wei_kcyx,
|
||||
Tensor<TOut>& out_nkhw,
|
||||
LowerPads,
|
||||
UpperPads)
|
||||
{
|
||||
unsigned h_pad_low = LowerPads{}.Get(Number<0>{});
|
||||
unsigned w_pad_low = LowerPads{}.Get(Number<1>{});
|
||||
@@ -129,26 +114,29 @@ void host_direct_convolution(
|
||||
if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
|
||||
wi < in_nchw.mDesc.GetLengths()[3])
|
||||
{
|
||||
v += in_nchw(n, c, hi, wi) * wei_kcyx(k, c, y, x);
|
||||
v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
out(n, k, ho, wo) = v;
|
||||
out_nkhw(n, k, ho, wo) = v;
|
||||
};
|
||||
|
||||
auto f_par = make_ParallelTensorFunctor(f,
|
||||
out.mDesc.GetLengths()[0],
|
||||
out.mDesc.GetLengths()[1],
|
||||
out.mDesc.GetLengths()[2],
|
||||
out.mDesc.GetLengths()[3]);
|
||||
out_nkhw.mDesc.GetLengths()[0],
|
||||
out_nkhw.mDesc.GetLengths()[1],
|
||||
out_nkhw.mDesc.GetLengths()[2],
|
||||
out_nkhw.mDesc.GetLengths()[3]);
|
||||
|
||||
f_par(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
template <class T, class LowerPads, class UpperPads>
|
||||
void host_winograd_3x3_convolution(
|
||||
const Tensor<T>& in_nchw, const Tensor<T>& wei_kcyx, Tensor<T>& out, LowerPads, UpperPads)
|
||||
template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
|
||||
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
|
||||
const Tensor<TWei>& wei_kcyx,
|
||||
Tensor<TOut>& out_nkhw,
|
||||
LowerPads,
|
||||
UpperPads)
|
||||
{
|
||||
constexpr std::size_t HoPerTile = 2;
|
||||
constexpr std::size_t WoPerTile = 2;
|
||||
@@ -162,8 +150,8 @@ void host_winograd_3x3_convolution(
|
||||
std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
|
||||
std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
|
||||
|
||||
std::size_t HO = out.mDesc.GetLengths()[2];
|
||||
std::size_t WO = out.mDesc.GetLengths()[3];
|
||||
std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
|
||||
std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
|
||||
|
||||
unsigned h_pad_low = LowerPads{}.Get(Number<0>{});
|
||||
unsigned w_pad_low = LowerPads{}.Get(Number<1>{});
|
||||
@@ -177,11 +165,11 @@ void host_winograd_3x3_convolution(
|
||||
std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile;
|
||||
std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile;
|
||||
|
||||
Tensor<T> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
|
||||
Tensor<T> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
|
||||
Tensor<T> wei_transform({K, C, HiPerTile, WiPerTile});
|
||||
Tensor<T> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
|
||||
Tensor<T> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
|
||||
Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
|
||||
Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
|
||||
Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
|
||||
Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
|
||||
Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
|
||||
|
||||
auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
|
||||
for(int j = 0; j < HiPerTile; ++j)
|
||||
@@ -198,7 +186,7 @@ void host_winograd_3x3_convolution(
|
||||
}
|
||||
else
|
||||
{
|
||||
in_hold(n, c, htile, wtile, j, i) = T(0);
|
||||
in_hold(n, c, htile, wtile, j, i) = TIn(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -259,49 +247,61 @@ void host_winograd_3x3_convolution(
|
||||
};
|
||||
|
||||
auto f_wei_transform = [&](auto k, auto c) {
|
||||
wei_transform(k, c, 0, 0) = wei_kcyx(k, c, 0, 0);
|
||||
wei_transform(k, c, 0, 1) =
|
||||
0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2);
|
||||
wei_transform(k, c, 0, 2) =
|
||||
0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2);
|
||||
wei_transform(k, c, 0, 3) = wei_kcyx(k, c, 0, 2);
|
||||
wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0));
|
||||
wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
|
||||
0.5 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.5 * double(wei_kcyx(k, c, 0, 2));
|
||||
wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
|
||||
0.5 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.5 * double(wei_kcyx(k, c, 0, 2));
|
||||
wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2));
|
||||
|
||||
wei_transform(k, c, 1, 0) =
|
||||
0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0);
|
||||
wei_transform(k, c, 1, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) +
|
||||
0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) +
|
||||
0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) +
|
||||
0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) +
|
||||
0.25 * wei_kcyx(k, c, 2, 2);
|
||||
wei_transform(k, c, 1, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) +
|
||||
0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) -
|
||||
0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) +
|
||||
0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) +
|
||||
0.25 * wei_kcyx(k, c, 2, 2);
|
||||
wei_transform(k, c, 1, 3) =
|
||||
0.5 * wei_kcyx(k, c, 0, 2) + 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2);
|
||||
wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
|
||||
0.5 * double(wei_kcyx(k, c, 1, 0)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 0));
|
||||
wei_transform(k, c, 1, 1) =
|
||||
0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) +
|
||||
0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 1, 2) =
|
||||
0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) -
|
||||
0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) +
|
||||
0.5 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 2));
|
||||
|
||||
wei_transform(k, c, 2, 0) =
|
||||
0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0);
|
||||
wei_transform(k, c, 2, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) +
|
||||
0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) -
|
||||
0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) +
|
||||
0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) +
|
||||
0.25 * wei_kcyx(k, c, 2, 2);
|
||||
wei_transform(k, c, 2, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) +
|
||||
0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) +
|
||||
0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) +
|
||||
0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) +
|
||||
0.25 * wei_kcyx(k, c, 2, 2);
|
||||
wei_transform(k, c, 2, 3) =
|
||||
0.5 * wei_kcyx(k, c, 0, 2) - 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2);
|
||||
wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
|
||||
0.5 * double(wei_kcyx(k, c, 1, 0)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 0));
|
||||
wei_transform(k, c, 2, 1) =
|
||||
0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) -
|
||||
0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 2, 2) =
|
||||
0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) +
|
||||
0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) -
|
||||
0.5 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 2));
|
||||
|
||||
wei_transform(k, c, 3, 0) = wei_kcyx(k, c, 2, 0);
|
||||
wei_transform(k, c, 3, 1) =
|
||||
0.5 * wei_kcyx(k, c, 2, 0) + 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2);
|
||||
wei_transform(k, c, 3, 2) =
|
||||
0.5 * wei_kcyx(k, c, 2, 0) - 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2);
|
||||
wei_transform(k, c, 3, 3) = wei_kcyx(k, c, 2, 2);
|
||||
wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0));
|
||||
wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) -
|
||||
0.5 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2));
|
||||
};
|
||||
|
||||
auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) {
|
||||
@@ -354,7 +354,7 @@ void host_winograd_3x3_convolution(
|
||||
for(int i = 0; i < WoPerTile; ++i)
|
||||
{
|
||||
std::size_t wo = WoPerTile * wtile + i;
|
||||
out(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
|
||||
out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -372,20 +372,25 @@ void host_winograd_3x3_convolution(
|
||||
template <class T>
|
||||
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
|
||||
{
|
||||
// printf("\n");
|
||||
|
||||
float error = 0;
|
||||
float max_diff = -1;
|
||||
float ref_value = 0, result_value = 0;
|
||||
for(int i = 0; i < ref.mData.size(); ++i)
|
||||
{
|
||||
error += std::abs(ref.mData[i] - result.mData[i]);
|
||||
float diff = std::abs(ref.mData[i] - result.mData[i]);
|
||||
error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
|
||||
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
|
||||
if(max_diff < diff)
|
||||
{
|
||||
max_diff = diff;
|
||||
ref_value = ref.mData[i];
|
||||
result_value = result.mData[i];
|
||||
}
|
||||
|
||||
// printf("{%f, %f}", double(ref.mData[i]), double(result.mData[i]));
|
||||
}
|
||||
// printf("\n");
|
||||
|
||||
std::cout << "error: " << error << std::endl;
|
||||
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
|
||||
@@ -404,7 +409,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
constexpr unsigned HPad = 0;
|
||||
constexpr unsigned WPad = 0;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// 3x3, 34x34
|
||||
constexpr unsigned N = 64;
|
||||
constexpr unsigned C = 256;
|
||||
@@ -563,6 +568,30 @@ int main(int argc, char* argv[])
|
||||
|
||||
constexpr unsigned HPad = 2;
|
||||
constexpr unsigned WPad = 2;
|
||||
#elif 0
|
||||
// 1x1 filter, 32x32 image
|
||||
constexpr unsigned N = 64;
|
||||
constexpr unsigned C = 256;
|
||||
constexpr unsigned HI = 32;
|
||||
constexpr unsigned WI = 32;
|
||||
constexpr unsigned K = 512;
|
||||
constexpr unsigned Y = 1;
|
||||
constexpr unsigned X = 1;
|
||||
|
||||
constexpr unsigned HPad = 0;
|
||||
constexpr unsigned WPad = 0;
|
||||
#elif 1
|
||||
// 1x1 filter, 14x14 image
|
||||
constexpr unsigned N = 128;
|
||||
constexpr unsigned C = 2048;
|
||||
constexpr unsigned HI = 14;
|
||||
constexpr unsigned WI = 14;
|
||||
constexpr unsigned K = 512;
|
||||
constexpr unsigned Y = 1;
|
||||
constexpr unsigned X = 1;
|
||||
|
||||
constexpr unsigned HPad = 0;
|
||||
constexpr unsigned WPad = 0;
|
||||
#endif
|
||||
|
||||
auto lower_pads = Sequence<HPad, WPad>{};
|
||||
@@ -577,10 +606,12 @@ int main(int argc, char* argv[])
|
||||
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
|
||||
ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
|
||||
|
||||
Tensor<float> in_nchw(make_TensorDescriptor(in_nchw_desc));
|
||||
Tensor<float> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
|
||||
Tensor<float> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
|
||||
Tensor<float> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
|
||||
using in_data_t = float;
|
||||
using out_data_t = float;
|
||||
Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
|
||||
Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
|
||||
Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
|
||||
Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
@@ -601,9 +632,13 @@ int main(int argc, char* argv[])
|
||||
#elif 1
|
||||
in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
#elif 1
|
||||
in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread);
|
||||
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
#elif 0
|
||||
in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
|
||||
|
||||
auto gen_wei = [](auto... is) {
|
||||
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
|
||||
};
|
||||
wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -611,7 +646,9 @@ int main(int argc, char* argv[])
|
||||
#if 0
|
||||
device_direct_convolution_1
|
||||
#elif 0
|
||||
device_direct_convolution_2
|
||||
device_direct_convolution_2_nchw_kcyx_nkhw
|
||||
#elif 0
|
||||
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
|
||||
#elif 1
|
||||
device_implicit_gemm_convolution_1_chwn_cyxk_khwn
|
||||
#elif 0
|
||||
@@ -633,7 +670,6 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
#if 1
|
||||
if(Y == 3 && X == 3)
|
||||
{
|
||||
host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
|
||||
@@ -643,7 +679,6 @@ int main(int argc, char* argv[])
|
||||
host_direct_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
|
||||
}
|
||||
check_error(out_nkhw_host, out_nkhw_device);
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
|
||||
|
||||
18
src/include/Array.hip.hpp
Normal file
18
src/include/Array.hip.hpp
Normal file
@@ -0,0 +1,18 @@
|
||||
#pragma once
|
||||
|
||||
template <class TData, unsigned NSize>
|
||||
struct Array
|
||||
{
|
||||
using Type = Array<TData, NSize>;
|
||||
|
||||
static constexpr unsigned nSize = NSize;
|
||||
|
||||
unsigned mData[nSize];
|
||||
|
||||
template <class... Xs>
|
||||
__host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
|
||||
{
|
||||
}
|
||||
|
||||
__host__ __device__ TData operator[](unsigned i) const { return mData[i]; }
|
||||
};
|
||||
@@ -65,8 +65,8 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0
|
||||
template <class Lengths, class Strides>
|
||||
struct ConstantTensorDescriptor
|
||||
{
|
||||
using Type = ConstantTensorDescriptor<Lengths, Strides>;
|
||||
static constexpr unsigned nDim = Lengths::nDim;
|
||||
using NDimConstant = Number<nDim>;
|
||||
|
||||
__host__ __device__ constexpr ConstantTensorDescriptor()
|
||||
{
|
||||
@@ -91,293 +91,70 @@ struct ConstantTensorDescriptor
|
||||
return Strides{}.Get(Number<I>{});
|
||||
}
|
||||
|
||||
// c++14 doesn't support constexpr lambdas, has to use this trick instead
|
||||
struct GetElementSize_f
|
||||
{
|
||||
template <class IDim>
|
||||
__host__ __device__ constexpr unsigned operator()(IDim idim) const
|
||||
{
|
||||
return Type{}.GetLength(idim);
|
||||
}
|
||||
};
|
||||
|
||||
__host__ __device__ constexpr unsigned GetElementSize() const
|
||||
{
|
||||
static_assert(nDim >= 2 && nDim <= 8, "nDim");
|
||||
|
||||
if(nDim == 2)
|
||||
// c++14 doesn't support constexpr lambdas, has to use this trick instead
|
||||
struct multiply
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
__host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const
|
||||
{
|
||||
return a * b;
|
||||
}
|
||||
};
|
||||
|
||||
return GetLength(I0) * GetLength(I1);
|
||||
}
|
||||
else if(nDim == 3)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
|
||||
return GetLength(I0) * GetLength(I1) * GetLength(I2);
|
||||
}
|
||||
else if(nDim == 4)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3);
|
||||
}
|
||||
else if(nDim == 5)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
|
||||
return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4);
|
||||
}
|
||||
else if(nDim == 6)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
|
||||
return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) *
|
||||
GetLength(I5);
|
||||
}
|
||||
else if(nDim == 7)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
|
||||
return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) *
|
||||
GetLength(I5) * GetLength(I6);
|
||||
}
|
||||
else if(nDim == 8)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
constexpr auto I7 = Number<7>{};
|
||||
|
||||
return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) *
|
||||
GetLength(I5) * GetLength(I6) * GetLength(I7);
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false);
|
||||
}
|
||||
return static_const_reduce_n<nDim>{}(GetElementSize_f{}, multiply{});
|
||||
}
|
||||
|
||||
// c++14 doesn't support constexpr lambdas, has to use this trick instead
|
||||
struct GetElementSpace_f
|
||||
{
|
||||
template <class IDim>
|
||||
__host__ __device__ constexpr unsigned operator()(IDim idim) const
|
||||
{
|
||||
return (Type{}.GetLength(idim) - 1) * Type{}.GetStride(idim);
|
||||
}
|
||||
};
|
||||
|
||||
template <class Align = Number<1>>
|
||||
__host__ __device__ constexpr unsigned GetElementSpace(Align align = Align{}) const
|
||||
{
|
||||
static_assert(nDim >= 2 && nDim <= 8, "nDim");
|
||||
|
||||
constexpr unsigned align_size = align.Get();
|
||||
|
||||
if(nDim == 2)
|
||||
// c++14 doesn't support constexpr lambdas, has to use this trick instead
|
||||
struct add
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
__host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
};
|
||||
|
||||
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
|
||||
align_size;
|
||||
}
|
||||
else if(nDim == 3)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
|
||||
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
|
||||
(GetLength(I2) - 1) * GetStride(I2) + align_size;
|
||||
}
|
||||
else if(nDim == 4)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
|
||||
(GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
|
||||
align_size;
|
||||
}
|
||||
else if(nDim == 5)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
|
||||
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
|
||||
(GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
|
||||
(GetLength(I4) - 1) * GetStride(I4) + align_size;
|
||||
}
|
||||
else if(nDim == 6)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
|
||||
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
|
||||
(GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
|
||||
(GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) +
|
||||
align_size;
|
||||
}
|
||||
else if(nDim == 7)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
|
||||
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
|
||||
(GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
|
||||
(GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) +
|
||||
(GetLength(I6) - 1) * GetStride(I6) + align_size;
|
||||
}
|
||||
else if(nDim == 8)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
constexpr auto I7 = Number<7>{};
|
||||
|
||||
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
|
||||
(GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
|
||||
(GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) +
|
||||
(GetLength(I6) - 1) * GetStride(I6) + (GetLength(I7) - 1) * GetStride(I7) +
|
||||
align_size;
|
||||
}
|
||||
return static_const_reduce_n<nDim>{}(GetElementSpace_f{}, add{}) + align.Get();
|
||||
}
|
||||
|
||||
// this is ugly, only for 2d
|
||||
__host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1) const
|
||||
template <class... Is>
|
||||
__host__ __device__ unsigned Get1dIndex(Is... is) const
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
static_assert(sizeof...(Is) == nDim, "number of multi-index is wrong");
|
||||
|
||||
static_assert(nDim == 2, "nDim is not 2");
|
||||
return i0 * GetStride(I0) + i1 * GetStride(I1);
|
||||
}
|
||||
const auto multi_id = Array<unsigned, nDim>(is...);
|
||||
|
||||
// this is ugly, only for 3d
|
||||
__host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1, unsigned i2) const
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
unsigned id = 0;
|
||||
|
||||
static_assert(nDim == 3, "nDim is not 3");
|
||||
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2);
|
||||
}
|
||||
static_loop_n<nDim>{}([&](auto IDim) {
|
||||
constexpr unsigned idim = IDim.Get();
|
||||
id += multi_id[idim] * GetStride(IDim);
|
||||
});
|
||||
|
||||
// this is ugly, only for 4d
|
||||
__host__ __device__ unsigned
|
||||
Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3) const
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
static_assert(nDim == 4, "nDim is not 4");
|
||||
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3);
|
||||
}
|
||||
|
||||
// this is ugly, only for 5d
|
||||
__host__ __device__ unsigned
|
||||
Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4) const
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
|
||||
static_assert(nDim == 5, "nDim is not 5");
|
||||
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
|
||||
i4 * GetStride(I4);
|
||||
}
|
||||
|
||||
// this is ugly, only for 6d
|
||||
__host__ __device__ unsigned
|
||||
Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4, unsigned i5) const
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
|
||||
static_assert(nDim == 6, "nDim is not 6");
|
||||
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
|
||||
i4 * GetStride(I4) + i5 * GetStride(I5);
|
||||
}
|
||||
|
||||
// this is ugly, only for 7d
|
||||
__host__ __device__ unsigned Get1dIndex(unsigned i0,
|
||||
unsigned i1,
|
||||
unsigned i2,
|
||||
unsigned i3,
|
||||
unsigned i4,
|
||||
unsigned i5,
|
||||
unsigned i6) const
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
|
||||
static_assert(nDim == 7, "nDim is not 7");
|
||||
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
|
||||
i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6);
|
||||
}
|
||||
|
||||
// this is ugly, only for 8d
|
||||
__host__ __device__ unsigned Get1dIndex(unsigned i0,
|
||||
unsigned i1,
|
||||
unsigned i2,
|
||||
unsigned i3,
|
||||
unsigned i4,
|
||||
unsigned i5,
|
||||
unsigned i6,
|
||||
unsigned i7) const
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
constexpr auto I7 = Number<7>{};
|
||||
|
||||
static_assert(nDim == 8, "nDim is not 8");
|
||||
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
|
||||
i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6) + i7 * GetStride(I7);
|
||||
return id;
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr auto Condense() const
|
||||
@@ -385,6 +162,12 @@ struct ConstantTensorDescriptor
|
||||
constexpr auto default_strides = calculate_default_strides(Lengths{});
|
||||
return ConstantTensorDescriptor<Lengths, decltype(default_strides)>{};
|
||||
}
|
||||
|
||||
template <unsigned IDim, unsigned NVector>
|
||||
__host__ __device__ constexpr auto Vectorize(Number<IDim>, Number<NVector>) const
|
||||
{
|
||||
assert(false); // not implemented
|
||||
}
|
||||
};
|
||||
|
||||
template <class Lengths>
|
||||
|
||||
92
src/include/Sequence.hip.hpp
Normal file
92
src/include/Sequence.hip.hpp
Normal file
@@ -0,0 +1,92 @@
|
||||
#pragma once
|
||||
#include "constant_integral.hip.hpp"
|
||||
#include "functional.hip.hpp"
|
||||
|
||||
template <unsigned... Is>
|
||||
struct Sequence
|
||||
{
|
||||
using Type = Sequence<Is...>;
|
||||
|
||||
static constexpr unsigned nDim = sizeof...(Is);
|
||||
|
||||
const unsigned mData[nDim] = {Is...};
|
||||
|
||||
template <unsigned I>
|
||||
__host__ __device__ constexpr unsigned Get(Number<I>) const
|
||||
{
|
||||
return mData[I];
|
||||
}
|
||||
|
||||
// this is ugly, only for nDIm = 4
|
||||
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
|
||||
__host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence<I0, I1, I2, I3>) const
|
||||
{
|
||||
static_assert(nDim == 4, "nDim != 4");
|
||||
|
||||
constexpr auto old_sequence = Type{};
|
||||
|
||||
constexpr unsigned NR0 = old_sequence.mData[I0];
|
||||
constexpr unsigned NR1 = old_sequence.mData[I1];
|
||||
constexpr unsigned NR2 = old_sequence.mData[I2];
|
||||
constexpr unsigned NR3 = old_sequence.mData[I3];
|
||||
|
||||
return Sequence<NR0, NR1, NR2, NR3>{};
|
||||
}
|
||||
|
||||
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
|
||||
__host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence<I0, I1, I2, I3>) const
|
||||
{
|
||||
// don't know how to implement this
|
||||
printf("Sequence::ReorderByPutOldToNew not implemented");
|
||||
assert(false);
|
||||
}
|
||||
|
||||
template <unsigned I>
|
||||
__host__ __device__ constexpr auto PushBack(Number<I>) const
|
||||
{
|
||||
return Sequence<Is..., I>{};
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr auto PopBack() const;
|
||||
|
||||
template <class F>
|
||||
__host__ __device__ constexpr auto Transform(F f) const
|
||||
{
|
||||
return Sequence<f(Is)...>{};
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned... Is, unsigned I>
|
||||
__host__ __device__ constexpr auto sequence_pop_back(Sequence<Is..., I>)
|
||||
{
|
||||
static_assert(sizeof...(Is) >= 1, "empty Sequence!");
|
||||
return Sequence<Is...>{};
|
||||
}
|
||||
|
||||
template <class F, unsigned... Xs, unsigned... Ys>
|
||||
__host__ __device__ constexpr auto sequence_sequence_op(Sequence<Xs...>, Sequence<Ys...>, F f)
|
||||
{
|
||||
static_assert(Sequence<Xs...>::nDim == Sequence<Ys...>::nDim, "Dim not the same");
|
||||
|
||||
return Sequence<f(Xs, Ys)...>{};
|
||||
}
|
||||
|
||||
template <unsigned... Xs, unsigned... Ys>
|
||||
__host__ __device__ constexpr auto sequence_sequence_add(Sequence<Xs...>, Sequence<Ys...>)
|
||||
{
|
||||
struct add
|
||||
{
|
||||
__host__ __device__ constexpr unsigned operator()(unsigned x, unsigned y) const
|
||||
{
|
||||
return x + y;
|
||||
}
|
||||
};
|
||||
|
||||
return sequence_sequence_op(Sequence<Xs...>{}, Sequence<Ys...>{}, add{});
|
||||
}
|
||||
|
||||
template <unsigned... Is>
|
||||
__host__ __device__ constexpr auto Sequence<Is...>::PopBack() const
|
||||
{
|
||||
return sequence_pop_back(Type{});
|
||||
}
|
||||
@@ -373,7 +373,7 @@ template <unsigned BlockSize,
|
||||
unsigned DataPerRead>
|
||||
struct Blockwise2dTensorCopy3
|
||||
{
|
||||
using vector_t = typename vector_type<Float, DataPerRead>::type;
|
||||
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
|
||||
|
||||
unsigned mSrcMyThreadOffset;
|
||||
unsigned mDstMyThreadOffset;
|
||||
@@ -383,8 +383,9 @@ struct Blockwise2dTensorCopy3
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
static_assert(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1,
|
||||
"wrong! only support stride1 == 1!\n");
|
||||
static_assert(DataPerRead == 1 ||
|
||||
(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1),
|
||||
"wrong! only support stride1 == 1 if DataPerRead > 1!\n");
|
||||
|
||||
static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
|
||||
"wrong! only support DataPerRead == 1, 2 or 4!\n");
|
||||
|
||||
@@ -131,11 +131,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
|
||||
|
||||
did[3] = is / ref_desc.GetStride(I3);
|
||||
|
||||
const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
|
||||
const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
|
||||
|
||||
const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
|
||||
const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
|
||||
|
||||
f(p_src[aindex], p_dst[bindex]);
|
||||
f(p_src[src_index], p_dst[dst_index]);
|
||||
}
|
||||
|
||||
constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
|
||||
@@ -162,11 +162,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
|
||||
|
||||
did[3] = is / ref_desc.GetStride(I3);
|
||||
|
||||
const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
|
||||
const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
|
||||
|
||||
const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
|
||||
const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
|
||||
|
||||
f(p_src[aindex], p_dst[bindex]);
|
||||
f(p_src[src_index], p_dst[dst_index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -199,15 +199,110 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
|
||||
SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
|
||||
}
|
||||
|
||||
template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
|
||||
template <unsigned BlockSize,
|
||||
class Float,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class CopyLengths,
|
||||
unsigned DataPerRead>
|
||||
struct Blockwise4dTensorCopy1
|
||||
{
|
||||
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
|
||||
|
||||
__device__ constexpr Blockwise4dTensorCopy1()
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
static_assert(DataPerRead == 1 ||
|
||||
(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1),
|
||||
"wrong! only support stride3 == 1 if DataPerRead > 1!\n");
|
||||
|
||||
static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
|
||||
"wrong! only support DataPerRead == 1, 2 or 4!\n");
|
||||
|
||||
static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
|
||||
DstDesc{}.GetStride(I2) % DataPerRead == 0,
|
||||
"src and dst stride2 should be multiple of DataPerRead to keep alignment");
|
||||
|
||||
// we allow out-of-bound read from src in D3 dimension,
|
||||
// but we need to make sure dst stride2 is big enough,
|
||||
// so that the out-of-bound write won't contaminate next line in dst
|
||||
constexpr unsigned L3 = CopyLengths{}.Get(I3);
|
||||
constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead);
|
||||
|
||||
static_assert(read_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2),
|
||||
"wrong! out-of-bound write will contaminate next line!\n");
|
||||
}
|
||||
|
||||
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
|
||||
{
|
||||
constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
|
||||
SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
|
||||
constexpr auto src_desc = SrcDesc{};
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
|
||||
constexpr unsigned L0 = CopyLengths{}.Get(I0);
|
||||
constexpr unsigned L1 = CopyLengths{}.Get(I1);
|
||||
constexpr unsigned L2 = CopyLengths{}.Get(I2);
|
||||
constexpr unsigned L3 = CopyLengths{}.Get(I3);
|
||||
|
||||
constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead);
|
||||
|
||||
constexpr auto ref_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
|
||||
|
||||
constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
|
||||
|
||||
auto f_copy = [&](unsigned is) {
|
||||
unsigned did[4];
|
||||
|
||||
did[0] = is / ref_desc.GetStride(I0);
|
||||
|
||||
is -= did[0] * ref_desc.GetStride(I0);
|
||||
|
||||
did[1] = is / ref_desc.GetStride(I1);
|
||||
|
||||
is -= did[1] * ref_desc.GetStride(I1);
|
||||
|
||||
did[2] = is / ref_desc.GetStride(I2);
|
||||
|
||||
is -= did[2] * ref_desc.GetStride(I2);
|
||||
|
||||
did[3] = is / ref_desc.GetStride(I3);
|
||||
|
||||
const unsigned src_index =
|
||||
src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
|
||||
const unsigned dst_index =
|
||||
dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
|
||||
*(reinterpret_cast<const vector_t*>(p_src + src_index));
|
||||
};
|
||||
|
||||
for(unsigned iloop = 0; iloop < NLoop; ++iloop)
|
||||
{
|
||||
unsigned is = threadIdx.x + iloop * BlockSize;
|
||||
|
||||
f_copy(is);
|
||||
}
|
||||
|
||||
constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
|
||||
|
||||
if(has_tail)
|
||||
{
|
||||
unsigned is = threadIdx.x + NLoop * BlockSize;
|
||||
|
||||
if(is < ref_desc.GetElementSize())
|
||||
{
|
||||
f_copy(is);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -350,7 +445,7 @@ template <unsigned BlockSize,
|
||||
unsigned DataPerRead>
|
||||
struct Blockwise4dTensorCopy3
|
||||
{
|
||||
using vector_t = typename vector_type<Float, DataPerRead>::type;
|
||||
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
|
||||
|
||||
unsigned mSrcMyThreadOffset;
|
||||
unsigned mDstMyThreadOffset;
|
||||
@@ -362,8 +457,9 @@ struct Blockwise4dTensorCopy3
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1,
|
||||
"wrong! only support stride3 == 1!\n");
|
||||
static_assert(DataPerRead == 1 ||
|
||||
(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1),
|
||||
"wrong! only support stride3 == 1 if DataPerRead > 1!\n");
|
||||
|
||||
static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
|
||||
"wrong! only support DataPerRead == 1, 2 or 4!\n");
|
||||
@@ -371,7 +467,7 @@ struct Blockwise4dTensorCopy3
|
||||
static_assert(
|
||||
SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
|
||||
DstDesc{}.GetStride(I2) % DataPerRead == 0,
|
||||
"wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
|
||||
"wrong! src and dst stride2 should be multiple of DataPerRead to keep alignment");
|
||||
|
||||
constexpr unsigned L0 = CopyLengths{}.Get(I0);
|
||||
constexpr unsigned L1 = CopyLengths{}.Get(I1);
|
||||
|
||||
@@ -435,11 +435,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
#pragma unroll
|
||||
for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
|
||||
{
|
||||
// read first batch of A, B
|
||||
// copy A-sub to form A
|
||||
#pragma unroll
|
||||
// read first batch of A, B
|
||||
// copy A-sub to form A
|
||||
//#pragma unroll
|
||||
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
|
||||
{
|
||||
#if 0
|
||||
threadwise_matrix_copy(
|
||||
a_block_mtx,
|
||||
p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
|
||||
@@ -447,12 +448,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
a_thread_mtx,
|
||||
p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
|
||||
a_thread_sub_mtx.GetLengths());
|
||||
#else
|
||||
for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
|
||||
{
|
||||
for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
|
||||
{
|
||||
p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] =
|
||||
p_a_block[a_block_mtx.Get1dIndex(k_begin + i,
|
||||
m_repeat * MPerLevel1Cluster + j) +
|
||||
mMyThreadOffsetA];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// copy B-sub to form B
|
||||
#pragma unroll
|
||||
// copy B-sub to form B
|
||||
//#pragma unroll
|
||||
for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
|
||||
{
|
||||
#if 0
|
||||
threadwise_matrix_copy(
|
||||
b_block_mtx,
|
||||
p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
|
||||
@@ -460,13 +474,26 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
b_thread_mtx,
|
||||
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
|
||||
b_thread_sub_mtx.GetLengths());
|
||||
#else
|
||||
for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
|
||||
{
|
||||
for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
|
||||
{
|
||||
p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] =
|
||||
p_b_block[b_block_mtx.Get1dIndex(k_begin + i,
|
||||
n_repeat * MPerLevel1Cluster + j) +
|
||||
mMyThreadOffsetB];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// loop over batch
|
||||
#pragma unroll
|
||||
// loop over batch
|
||||
//#pragma unroll
|
||||
for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
|
||||
{
|
||||
// do current batch of gemm
|
||||
// do current batch of gemm
|
||||
#if 0
|
||||
threadwise_gemm(a_thread_mtx,
|
||||
True,
|
||||
p_a_thread,
|
||||
@@ -477,13 +504,32 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
False,
|
||||
p_c_thread + ib * ThreadMatrixStrideC,
|
||||
f_accum);
|
||||
#else
|
||||
for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
|
||||
{
|
||||
for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
|
||||
{
|
||||
for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j)
|
||||
{
|
||||
const unsigned aindex =
|
||||
a_thread_mtx.Get1dIndex(k, i); // A is transposed
|
||||
const unsigned bindex = b_thread_mtx.Get1dIndex(k, j);
|
||||
const unsigned cindex =
|
||||
c_thread_mtx.Get1dIndex(i, j) + ib * ThreadMatrixStrideC;
|
||||
|
||||
f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// read next batch of a, b
|
||||
if(BlockMatrixStrideA != 0)
|
||||
{
|
||||
#pragma unroll
|
||||
//#pragma unroll
|
||||
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
|
||||
{
|
||||
#if 0
|
||||
threadwise_matrix_copy(
|
||||
a_block_mtx,
|
||||
p_a_block +
|
||||
@@ -492,14 +538,28 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
a_thread_mtx,
|
||||
p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
|
||||
a_thread_sub_mtx.GetLengths());
|
||||
#else
|
||||
for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
|
||||
{
|
||||
for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
|
||||
{
|
||||
p_a_thread[a_thread_mtx.Get1dIndex(i,
|
||||
m_repeat * MPerThreadSubC + j)] =
|
||||
p_a_block[a_block_mtx.Get1dIndex(
|
||||
k_begin + i, m_repeat * MPerLevel1Cluster + j) +
|
||||
(ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if(BlockMatrixStrideB != 0)
|
||||
{
|
||||
#pragma unroll
|
||||
//#pragma unroll
|
||||
for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
|
||||
{
|
||||
#if 0
|
||||
threadwise_matrix_copy(
|
||||
b_block_mtx,
|
||||
p_b_block +
|
||||
@@ -508,11 +568,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
b_thread_mtx,
|
||||
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
|
||||
b_thread_sub_mtx.GetLengths());
|
||||
#else
|
||||
for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
|
||||
{
|
||||
for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
|
||||
{
|
||||
p_b_thread[b_thread_mtx.Get1dIndex(i,
|
||||
n_repeat * NPerThreadSubC + j)] =
|
||||
p_b_block[b_block_mtx.Get1dIndex(
|
||||
k_begin + i, n_repeat * MPerLevel1Cluster + j) +
|
||||
(ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// do last batch of gemm
|
||||
// do last batch of gemm
|
||||
#if 0
|
||||
threadwise_gemm(a_thread_mtx,
|
||||
True,
|
||||
p_a_thread,
|
||||
@@ -523,6 +597,23 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
False,
|
||||
p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC,
|
||||
f_accum);
|
||||
#else
|
||||
for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
|
||||
{
|
||||
for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
|
||||
{
|
||||
for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j)
|
||||
{
|
||||
const unsigned aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed
|
||||
const unsigned bindex = b_thread_mtx.Get1dIndex(k, j);
|
||||
const unsigned cindex = c_thread_mtx.Get1dIndex(i, j) +
|
||||
(BatchPerThread - 1) * ThreadMatrixStrideC;
|
||||
|
||||
f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -551,9 +642,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
c_thread_mtx_begin.batch * BlockMatrixStrideC +
|
||||
c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);
|
||||
|
||||
for(unsigned m_repeat = 0; m_repeat, MRepeat; ++m_repeat)
|
||||
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
|
||||
{
|
||||
for(unsigned n_repeat = 0; n_repeat, NRepeat; ++n_repeat)
|
||||
for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
|
||||
{
|
||||
threadwise_matrix_copy(
|
||||
c_thread_sub_mtx,
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
#pragma once
|
||||
#include "data_type.hip.hpp"
|
||||
#include "constant_integral.hip.hpp"
|
||||
#include "Sequence.hip.hpp"
|
||||
#include "Array.hip.hpp"
|
||||
#include "functional.hip.hpp"
|
||||
|
||||
__device__ unsigned get_thread_local_1d_id() { return threadIdx.x; }
|
||||
|
||||
@@ -16,129 +21,6 @@ struct is_same<T, T>
|
||||
static const bool value = true;
|
||||
};
|
||||
|
||||
template <class T, unsigned N>
|
||||
struct vector_type
|
||||
{
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<float, 1>
|
||||
{
|
||||
using type = float;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<float, 2>
|
||||
{
|
||||
using type = float2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<float, 4>
|
||||
{
|
||||
using type = float4;
|
||||
};
|
||||
|
||||
#if 0
|
||||
template <>
|
||||
struct vector_type<half_float::half, 1>
|
||||
{
|
||||
using type = half_float::half;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half_float::half, 2>
|
||||
{
|
||||
using type = float;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half_float::half, 4>
|
||||
{
|
||||
using type = float2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half_float::half, 8>
|
||||
{
|
||||
using type = float4;
|
||||
};
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
template <>
|
||||
struct vector_type<half, 1>
|
||||
{
|
||||
using type = half;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half, 2>
|
||||
{
|
||||
using type = half2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half, 4>
|
||||
{
|
||||
using type = float2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half, 8>
|
||||
{
|
||||
using type = float4;
|
||||
};
|
||||
#endif
|
||||
|
||||
template <class T, T N>
|
||||
struct integral_constant
|
||||
{
|
||||
static const T value = N;
|
||||
|
||||
__host__ __device__ constexpr T Get() const { return value; }
|
||||
};
|
||||
|
||||
template <unsigned N>
|
||||
using Number = integral_constant<unsigned, N>;
|
||||
|
||||
template <unsigned... Is>
|
||||
struct Sequence
|
||||
{
|
||||
using Type = Sequence<Is...>;
|
||||
|
||||
static constexpr unsigned nDim = sizeof...(Is);
|
||||
|
||||
const unsigned mData[nDim] = {Is...};
|
||||
|
||||
template <unsigned I>
|
||||
__host__ __device__ constexpr unsigned Get(Number<I>) const
|
||||
{
|
||||
return mData[I];
|
||||
}
|
||||
|
||||
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
|
||||
__host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence<I0, I1, I2, I3>) const
|
||||
{
|
||||
constexpr auto old_sequence = Type{};
|
||||
|
||||
constexpr unsigned NR0 = old_sequence.mData[I0];
|
||||
constexpr unsigned NR1 = old_sequence.mData[I1];
|
||||
constexpr unsigned NR2 = old_sequence.mData[I2];
|
||||
constexpr unsigned NR3 = old_sequence.mData[I3];
|
||||
|
||||
return Sequence<NR0, NR1, NR2, NR3>{};
|
||||
}
|
||||
|
||||
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
|
||||
__host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence<I0, I1, I2, I3>) const
|
||||
{
|
||||
// don't know how to implement this
|
||||
printf("Sequence::ReorderByPutOldToNew not implemented");
|
||||
assert(false);
|
||||
}
|
||||
};
|
||||
|
||||
#if DEVICE_BACKEND_CUDA
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr T max(T a, T b)
|
||||
|
||||
@@ -4,9 +4,10 @@
|
||||
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_fp16.h"
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#include "cuda_runtime.h"
|
||||
#include "cuda_fp16.h"
|
||||
#include "nvToolsExt.h"
|
||||
#include "helper_cuda.h"
|
||||
#include "cuda_fp16.h"
|
||||
#endif
|
||||
|
||||
12
src/include/constant_integral.hip.hpp
Normal file
12
src/include/constant_integral.hip.hpp
Normal file
@@ -0,0 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
template <class T, T N>
|
||||
struct integral_constant
|
||||
{
|
||||
static const T value = N;
|
||||
|
||||
__host__ __device__ constexpr T Get() const { return value; }
|
||||
};
|
||||
|
||||
template <unsigned N>
|
||||
using Number = integral_constant<unsigned, N>;
|
||||
214
src/include/data_type.hip.hpp
Normal file
214
src/include/data_type.hip.hpp
Normal file
@@ -0,0 +1,214 @@
|
||||
#pragma once
|
||||
#include "config.h"
|
||||
|
||||
template <class T, unsigned N>
|
||||
struct vector_type
|
||||
{
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<float, 1>
|
||||
{
|
||||
using MemoryType = float;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<float, 2>
|
||||
{
|
||||
using MemoryType = float2;
|
||||
|
||||
__host__ __device__ static MemoryType Pack(float s0, float s1)
|
||||
{
|
||||
union
|
||||
{
|
||||
MemoryType vector;
|
||||
float scalar[2];
|
||||
} data;
|
||||
|
||||
data.scalar[0] = s0;
|
||||
data.scalar[1] = s1;
|
||||
return data.vector;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<float, 4>
|
||||
{
|
||||
using MemoryType = float4;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<float2, 2>
|
||||
{
|
||||
using MemoryType = float4;
|
||||
};
|
||||
|
||||
#if 0
|
||||
template <>
|
||||
struct vector_type<half, 1>
|
||||
{
|
||||
using MemoryType = half;
|
||||
|
||||
__host__ __device__ static MemoryType Pack(half s) { return s; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half, 2>
|
||||
{
|
||||
using MemoryType = half2;
|
||||
|
||||
__host__ __device__ static MemoryType Pack(half s0, half s1)
|
||||
{
|
||||
union
|
||||
{
|
||||
MemoryType vector;
|
||||
half scalar[2];
|
||||
} data;
|
||||
|
||||
data.scalar[0] = s0;
|
||||
data.scalar[1] = s1;
|
||||
return data.vector;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half, 4>
|
||||
{
|
||||
using MemoryType = float2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half, 8>
|
||||
{
|
||||
using MemoryType = float4;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char, 1>
|
||||
{
|
||||
using MemoryType = char;
|
||||
|
||||
__host__ __device__ static MemoryType Pack(char s) { return s; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char, 2>
|
||||
{
|
||||
using MemoryType = int16_t;
|
||||
|
||||
__host__ __device__ static MemoryType Pack(char s0, char s1)
|
||||
{
|
||||
union
|
||||
{
|
||||
MemoryType vector;
|
||||
char scalar[2];
|
||||
} data;
|
||||
|
||||
data.scalar[0] = s0;
|
||||
data.scalar[1] = s1;
|
||||
return data.vector;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char, 4>
|
||||
{
|
||||
using MemoryType = int32_t;
|
||||
|
||||
__host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3)
|
||||
{
|
||||
union
|
||||
{
|
||||
MemoryType vector;
|
||||
char scalar[4];
|
||||
} data;
|
||||
|
||||
data.scalar[0] = s0;
|
||||
data.scalar[1] = s1;
|
||||
data.scalar[2] = s2;
|
||||
data.scalar[3] = s3;
|
||||
return data.vector;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char, 8>
|
||||
{
|
||||
using MemoryType = int64_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<int32_t, 2>
|
||||
{
|
||||
using MemoryType = int64_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char2, 2>
|
||||
{
|
||||
using MemoryType = char4;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char2, 4>
|
||||
{
|
||||
using MemoryType = int64_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char4, 1>
|
||||
{
|
||||
using MemoryType = int;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char4, 2>
|
||||
{
|
||||
using MemoryType = int64_t;
|
||||
};
|
||||
#endif
|
||||
|
||||
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
|
||||
{
|
||||
d += s0 * s1;
|
||||
}
|
||||
|
||||
__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1)
|
||||
{
|
||||
d += s0.x * s1.x;
|
||||
d += s0.y * s1.y;
|
||||
}
|
||||
|
||||
__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
|
||||
{
|
||||
d += s0.x * s1.x;
|
||||
d += s0.y * s1.y;
|
||||
d += s0.z * s1.z;
|
||||
d += s0.w * s1.w;
|
||||
}
|
||||
|
||||
#if 0
|
||||
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
|
||||
|
||||
__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
|
||||
{
|
||||
d += s0.x * s1.x;
|
||||
d += s0.y * s1.y;
|
||||
}
|
||||
|
||||
__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
|
||||
{
|
||||
d += s0.x * s1.x + s0.y * s1.y;
|
||||
}
|
||||
|
||||
__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
|
||||
|
||||
// TODO:: this interface is misleading, s0, s1 are actually int8x4
|
||||
// need to make a better interface
|
||||
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
|
||||
{
|
||||
#if DEVICE_BACKEND_CUDA
|
||||
d = __dp4a(s0, s1, d);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
57
src/include/functional.hip.hpp
Normal file
57
src/include/functional.hip.hpp
Normal file
@@ -0,0 +1,57 @@
|
||||
#pragma once
|
||||
#include "constant_integral.hip.hpp"
|
||||
|
||||
template <unsigned NLoop>
|
||||
struct static_loop_n
|
||||
{
|
||||
template <class F>
|
||||
__host__ __device__ void operator()(F f) const
|
||||
{
|
||||
static_assert(NLoop > 1, "out-of-range");
|
||||
|
||||
f(Number<NLoop - 1>{});
|
||||
static_loop_n<NLoop - 1>{}(f);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct static_loop_n<1>
|
||||
{
|
||||
template <class F>
|
||||
__host__ __device__ void operator()(F f) const
|
||||
{
|
||||
f(Number<0>{});
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned NLoop>
|
||||
struct static_const_reduce_n
|
||||
{
|
||||
template <class F, class Reduce>
|
||||
__host__ __device__ constexpr auto operator()(F f, Reduce r) const
|
||||
{
|
||||
static_assert(NLoop > 1, "out-of-range");
|
||||
|
||||
constexpr auto a = f(Number<NLoop - 1>{});
|
||||
auto b = static_const_reduce_n<NLoop - 1>{}(f, r); // cannot use constexpr here, weird
|
||||
return r(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct static_const_reduce_n<1>
|
||||
{
|
||||
template <class F, class Reduce>
|
||||
__host__ __device__ constexpr auto operator()(F f, Reduce) const
|
||||
{
|
||||
return f(Number<0>{});
|
||||
}
|
||||
};
|
||||
|
||||
#if 0
|
||||
template<class F>
|
||||
__host__ __device__ constexpr auto unpacker(F f)
|
||||
{
|
||||
return [=](auto xs_array){ f(xs...); };
|
||||
}
|
||||
#endif
|
||||
@@ -1,198 +0,0 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_direct_convolution.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "threadwise_direct_convolution.hip.hpp"
|
||||
|
||||
template <class Float,
|
||||
class InGlobalDesc,
|
||||
class WeiGlobalDesc,
|
||||
class OutGlobalDesc,
|
||||
unsigned NPerBlock,
|
||||
unsigned KPerBlock,
|
||||
unsigned CPerBlock,
|
||||
unsigned HoPerBlock,
|
||||
unsigned WoPerBlock,
|
||||
unsigned NPerThread,
|
||||
unsigned KPerThread,
|
||||
unsigned CPerThread,
|
||||
unsigned HoPerThread,
|
||||
unsigned WoPerThread,
|
||||
unsigned BlockSize,
|
||||
unsigned GridSize>
|
||||
__global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto in_global_desc = InGlobalDesc{};
|
||||
constexpr auto wei_global_desc = WeiGlobalDesc{};
|
||||
constexpr auto out_global_desc = OutGlobalDesc{};
|
||||
|
||||
constexpr unsigned Y = wei_global_desc.GetLength(I2);
|
||||
constexpr unsigned X = wei_global_desc.GetLength(I3);
|
||||
|
||||
constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
|
||||
constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
|
||||
|
||||
constexpr auto in_block_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
|
||||
|
||||
constexpr auto wei_block_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{});
|
||||
|
||||
// shared mem
|
||||
constexpr unsigned in_block_size = in_block_desc.GetElementSpace();
|
||||
constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
|
||||
|
||||
__shared__ Float p_in_block[in_block_size];
|
||||
__shared__ Float p_wei_block[wei_block_size];
|
||||
|
||||
// threadwise tensors
|
||||
constexpr unsigned HiPerThread = HoPerThread + Y - 1;
|
||||
constexpr unsigned WiPerThread = WoPerThread + X - 1;
|
||||
|
||||
constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{}, in_block_desc.GetStrides());
|
||||
|
||||
constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<KPerThread, CPerThread, Y, X>{}, wei_block_desc.GetStrides());
|
||||
|
||||
constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
|
||||
in_thread_block_desc, wei_thread_block_desc);
|
||||
|
||||
// register
|
||||
Float p_out_thread[out_thread_desc.GetElementSpace()];
|
||||
|
||||
// divide block work
|
||||
constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
|
||||
constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
|
||||
constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
|
||||
constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
|
||||
|
||||
const unsigned block_id = blockIdx.x;
|
||||
|
||||
unsigned itmp = block_id;
|
||||
const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
|
||||
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
|
||||
const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
|
||||
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
|
||||
const unsigned h_block_work_id = itmp / WBlockWork;
|
||||
const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
|
||||
|
||||
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
|
||||
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
|
||||
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
|
||||
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
|
||||
|
||||
const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
|
||||
const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
|
||||
|
||||
// divide thread work
|
||||
constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
|
||||
constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
|
||||
constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
|
||||
constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
|
||||
|
||||
const unsigned thread_id = threadIdx.x;
|
||||
|
||||
itmp = thread_id;
|
||||
const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
|
||||
itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
|
||||
const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork);
|
||||
itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
|
||||
const unsigned h_thread_work_id = itmp / WThreadWork;
|
||||
const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
|
||||
|
||||
const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
|
||||
const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
|
||||
const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
|
||||
const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;
|
||||
|
||||
const unsigned hi_thread_data_begin = ho_thread_data_begin;
|
||||
const unsigned wi_thread_data_begin = wo_thread_data_begin;
|
||||
|
||||
constexpr auto blockwise_in_copy =
|
||||
Blockwise4dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(in_global_desc),
|
||||
decltype(in_block_desc),
|
||||
decltype(in_block_desc.GetLengths())>{};
|
||||
|
||||
constexpr auto blockwise_wei_copy =
|
||||
Blockwise4dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(wei_global_desc),
|
||||
decltype(wei_block_desc),
|
||||
decltype(wei_block_desc.GetLengths())>{};
|
||||
|
||||
// set threadwise output tensor to 0
|
||||
threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread);
|
||||
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1);
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
{
|
||||
// copy input tensor to LDS
|
||||
blockwise_in_copy.Run(p_in_global +
|
||||
in_global_desc.Get1dIndex(n_block_data_begin,
|
||||
c_block_data_begin,
|
||||
hi_block_data_begin,
|
||||
wi_block_data_begin),
|
||||
p_in_block);
|
||||
|
||||
// copy weight tensor to LDS
|
||||
blockwise_wei_copy.Run(
|
||||
p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
|
||||
p_wei_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
|
||||
{
|
||||
// threadwise convolution
|
||||
#if 1
|
||||
threadwise_direct_convolution_2(
|
||||
in_thread_block_desc,
|
||||
p_in_block +
|
||||
in_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_thread_block_desc,
|
||||
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_thread_desc,
|
||||
p_out_thread);
|
||||
#elif 0
|
||||
threadwise_direct_convolution_3(
|
||||
in_thread_block_desc,
|
||||
p_in_block +
|
||||
in_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_thread_block_desc,
|
||||
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_thread_desc,
|
||||
p_out_thread);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// copy output tensor from register to global mem
|
||||
threadwise_4d_tensor_copy(
|
||||
out_thread_desc,
|
||||
p_out_thread,
|
||||
out_global_desc,
|
||||
p_out_global +
|
||||
out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_thread_desc.GetLengths());
|
||||
}
|
||||
238
src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
Normal file
238
src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
Normal file
@@ -0,0 +1,238 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_direct_convolution.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "threadwise_direct_convolution.hip.hpp"
|
||||
|
||||
template <class Float,
|
||||
class InGlobalDesc,
|
||||
class WeiGlobalDesc,
|
||||
class OutGlobalDesc,
|
||||
unsigned NPerBlock,
|
||||
unsigned KPerBlock,
|
||||
unsigned CPerBlock,
|
||||
unsigned HoPerBlock,
|
||||
unsigned WoPerBlock,
|
||||
unsigned NPerThread,
|
||||
unsigned KPerThread,
|
||||
unsigned CPerThread,
|
||||
unsigned HoPerThread,
|
||||
unsigned WoPerThread,
|
||||
unsigned InBlockCopyDataPerRead,
|
||||
unsigned WeiBlockCopyDataPerRead,
|
||||
unsigned BlockSize,
|
||||
unsigned GridSize>
|
||||
__global__ void
|
||||
gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto in_nchw_global_desc = InGlobalDesc{};
|
||||
constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{};
|
||||
constexpr auto out_nkhw_global_desc = OutGlobalDesc{};
|
||||
|
||||
constexpr unsigned N = in_nchw_global_desc.GetLength(I0);
|
||||
constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0);
|
||||
constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1);
|
||||
constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2);
|
||||
constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3);
|
||||
|
||||
constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy
|
||||
|
||||
constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
|
||||
constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
|
||||
|
||||
constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});
|
||||
|
||||
constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<KPerBlock, CPerBlock * Y * X>{},
|
||||
Number<WeiBlockCopyDataPerRead>{}); // 2d view of wei for blockwise copy
|
||||
|
||||
constexpr auto wei_kcyx_block_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{},
|
||||
Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});
|
||||
|
||||
// shared mem
|
||||
constexpr unsigned in_block_size =
|
||||
in_nchw_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
|
||||
constexpr unsigned wei_block_size =
|
||||
wei_kcyx_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
|
||||
|
||||
constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
|
||||
? InBlockCopyDataPerRead
|
||||
: WeiBlockCopyDataPerRead;
|
||||
|
||||
__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
|
||||
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
|
||||
|
||||
// threadwise tensors
|
||||
constexpr unsigned HiPerThread = HoPerThread + Y - 1;
|
||||
constexpr unsigned WiPerThread = WoPerThread + X - 1;
|
||||
|
||||
constexpr auto in_nchw_thread_block_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
|
||||
in_nchw_block_desc.GetStrides());
|
||||
|
||||
constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_block_desc.GetStrides());
|
||||
|
||||
constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
|
||||
in_nchw_thread_block_desc, wei_kcyx_thread_block_desc);
|
||||
|
||||
// register
|
||||
Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
|
||||
|
||||
// divide block work
|
||||
constexpr unsigned NBlockWork =
|
||||
(out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
|
||||
constexpr unsigned KBlockWork =
|
||||
(out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
|
||||
constexpr unsigned HBlockWork =
|
||||
(out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
|
||||
constexpr unsigned WBlockWork =
|
||||
(out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
|
||||
|
||||
const unsigned block_id = blockIdx.x;
|
||||
|
||||
unsigned itmp = block_id;
|
||||
const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
|
||||
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
|
||||
const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
|
||||
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
|
||||
const unsigned h_block_work_id = itmp / WBlockWork;
|
||||
const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
|
||||
|
||||
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
|
||||
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
|
||||
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
|
||||
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
|
||||
|
||||
const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
|
||||
const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
|
||||
|
||||
// divide thread work
|
||||
constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
|
||||
constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
|
||||
constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
|
||||
constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
|
||||
|
||||
const unsigned thread_id = threadIdx.x;
|
||||
|
||||
itmp = thread_id;
|
||||
const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
|
||||
itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
|
||||
const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork);
|
||||
itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
|
||||
const unsigned h_thread_work_id = itmp / WThreadWork;
|
||||
const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
|
||||
|
||||
const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
|
||||
const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
|
||||
const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
|
||||
const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;
|
||||
|
||||
const unsigned hi_thread_data_begin = ho_thread_data_begin;
|
||||
const unsigned wi_thread_data_begin = wo_thread_data_begin;
|
||||
|
||||
constexpr auto blockwise_in_copy =
|
||||
Blockwise4dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(in_nchw_global_desc),
|
||||
decltype(in_nchw_block_desc),
|
||||
decltype(in_nchw_block_desc.GetLengths()),
|
||||
InBlockCopyDataPerRead>{};
|
||||
|
||||
#if 0
|
||||
constexpr auto blockwise_wei_copy =
|
||||
Blockwise4dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(wei_kcyx_global_desc),
|
||||
decltype(wei_kcyx_block_desc),
|
||||
decltype(wei_kcyx_block_desc.GetLengths()),
|
||||
1>{};
|
||||
#elif 1
|
||||
const auto blockwise_wei_copy = Blockwise2dTensorCopy3<BlockSize,
|
||||
Float,
|
||||
decltype(wei_ke_global_desc),
|
||||
decltype(wei_ke_block_desc),
|
||||
decltype(wei_ke_block_desc.GetLengths()),
|
||||
WeiBlockCopyDataPerRead>{};
|
||||
#endif
|
||||
|
||||
// set threadwise output tensor to 0
|
||||
threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
|
||||
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
{
|
||||
// copy input tensor to LDS
|
||||
blockwise_in_copy.Run(p_in_global +
|
||||
in_nchw_global_desc.Get1dIndex(n_block_data_begin,
|
||||
c_block_data_begin,
|
||||
hi_block_data_begin,
|
||||
wi_block_data_begin),
|
||||
p_in_block);
|
||||
|
||||
// copy weight tensor to LDS
|
||||
blockwise_wei_copy.Run(
|
||||
p_wei_global +
|
||||
wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
|
||||
p_wei_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
|
||||
{
|
||||
// threadwise convolution
|
||||
#if 1
|
||||
threadwise_direct_convolution_2(
|
||||
in_nchw_thread_block_desc,
|
||||
p_in_block +
|
||||
in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_kcyx_thread_block_desc,
|
||||
p_wei_block +
|
||||
wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread);
|
||||
#elif 0
|
||||
threadwise_direct_convolution_3(
|
||||
in_nchw_thread_block_desc,
|
||||
p_in_block +
|
||||
in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_kcyx_thread_block_desc,
|
||||
p_wei_block +
|
||||
wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// copy output tensor from register to global mem
|
||||
threadwise_4d_tensor_copy(
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread,
|
||||
out_nkhw_global_desc,
|
||||
p_out_global +
|
||||
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_nkhw_thread_desc.GetLengths());
|
||||
}
|
||||
@@ -0,0 +1,254 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_direct_convolution.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "threadwise_direct_convolution.hip.hpp"
|
||||
|
||||
template <class TInWei,
|
||||
class TOut,
|
||||
class TAccum,
|
||||
class InGlobalDesc,
|
||||
class WeiGlobalDesc,
|
||||
class OutGlobalDesc,
|
||||
unsigned ScalarPerVector,
|
||||
unsigned NPerBlock,
|
||||
unsigned KPerBlock,
|
||||
unsigned CPerBlock,
|
||||
unsigned HoPerBlock,
|
||||
unsigned WoPerBlock,
|
||||
unsigned NPerThread,
|
||||
unsigned KPerThread,
|
||||
unsigned CPerThread,
|
||||
unsigned HoPerThread,
|
||||
unsigned WoPerThread,
|
||||
unsigned InBlockCopyDataPerRead,
|
||||
unsigned WeiBlockCopyDataPerRead,
|
||||
unsigned BlockSize,
|
||||
unsigned GridSize>
|
||||
__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
|
||||
const typename vector_type<TInWei,
|
||||
ScalarPerVector>::MemoryType* const __restrict__ p_in_vec_global,
|
||||
const typename vector_type<TInWei,
|
||||
ScalarPerVector>::MemoryType* const __restrict__ p_wei_vec_global,
|
||||
TOut* const __restrict__ p_out_global)
|
||||
{
|
||||
using in_scalar_t = TInWei;
|
||||
using in_vector_mem_t = typename vector_type<in_scalar_t, ScalarPerVector>::MemoryType;
|
||||
using out_scalar_t = TOut;
|
||||
using accum_t = TAccum;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto in_nchw_vec_global_desc = InGlobalDesc{};
|
||||
constexpr auto wei_kcyx_vec_global_desc = WeiGlobalDesc{};
|
||||
constexpr auto out_nkhw_global_desc = OutGlobalDesc{};
|
||||
|
||||
constexpr unsigned N = in_nchw_vec_global_desc.GetLength(I0);
|
||||
constexpr unsigned K = wei_kcyx_vec_global_desc.GetLength(I0);
|
||||
constexpr unsigned C = wei_kcyx_vec_global_desc.GetLength(I1);
|
||||
constexpr unsigned Y = wei_kcyx_vec_global_desc.GetLength(I2);
|
||||
constexpr unsigned X = wei_kcyx_vec_global_desc.GetLength(I3);
|
||||
|
||||
constexpr auto wei_ke_vec_global_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy
|
||||
|
||||
constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
|
||||
constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
|
||||
|
||||
constexpr auto in_nchw_vec_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});
|
||||
|
||||
constexpr auto wei_ke_vec_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<KPerBlock, CPerBlock * Y * X>{},
|
||||
Number<WeiBlockCopyDataPerRead>{}); // 2d view of wei for blockwise copy
|
||||
|
||||
constexpr auto wei_kcyx_vec_block_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{},
|
||||
Sequence<wei_ke_vec_block_desc.GetStride(I0), Y * X, X, 1>{});
|
||||
|
||||
// shared mem
|
||||
constexpr unsigned in_block_size =
|
||||
in_nchw_vec_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
|
||||
|
||||
constexpr unsigned wei_block_size =
|
||||
wei_kcyx_vec_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
|
||||
|
||||
constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
|
||||
? InBlockCopyDataPerRead
|
||||
: WeiBlockCopyDataPerRead;
|
||||
|
||||
__shared__ in_vector_mem_t
|
||||
p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)];
|
||||
__shared__ in_vector_mem_t
|
||||
p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
|
||||
|
||||
// threadwise tensors
|
||||
constexpr unsigned HiPerThread = HoPerThread + Y - 1;
|
||||
constexpr unsigned WiPerThread = WoPerThread + X - 1;
|
||||
|
||||
constexpr auto in_nchw_vec_thread_block_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
|
||||
in_nchw_vec_block_desc.GetStrides());
|
||||
|
||||
constexpr auto wei_kcyx_vec_thread_block_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_vec_block_desc.GetStrides());
|
||||
|
||||
constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
|
||||
in_nchw_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc);
|
||||
|
||||
// register
|
||||
out_scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
|
||||
|
||||
// divide block work
|
||||
constexpr unsigned NBlockWork =
|
||||
(out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
|
||||
constexpr unsigned KBlockWork =
|
||||
(out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
|
||||
constexpr unsigned HBlockWork =
|
||||
(out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
|
||||
constexpr unsigned WBlockWork =
|
||||
(out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
|
||||
|
||||
const unsigned block_id = blockIdx.x;
|
||||
|
||||
unsigned itmp = block_id;
|
||||
const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
|
||||
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
|
||||
const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
|
||||
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
|
||||
const unsigned h_block_work_id = itmp / WBlockWork;
|
||||
const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
|
||||
|
||||
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
|
||||
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
|
||||
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
|
||||
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
|
||||
|
||||
const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
|
||||
const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
|
||||
|
||||
// divide thread work
|
||||
constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
|
||||
constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
|
||||
constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
|
||||
constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
|
||||
|
||||
const unsigned thread_id = threadIdx.x;
|
||||
|
||||
itmp = thread_id;
|
||||
const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
|
||||
itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
|
||||
const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork);
|
||||
itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
|
||||
const unsigned h_thread_work_id = itmp / WThreadWork;
|
||||
const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
|
||||
|
||||
const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
|
||||
const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
|
||||
const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
|
||||
const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;
|
||||
|
||||
const unsigned hi_thread_data_begin = ho_thread_data_begin;
|
||||
const unsigned wi_thread_data_begin = wo_thread_data_begin;
|
||||
|
||||
constexpr auto blockwise_in_copy =
|
||||
Blockwise4dTensorCopy1<BlockSize,
|
||||
in_vector_mem_t,
|
||||
decltype(in_nchw_vec_global_desc),
|
||||
decltype(in_nchw_vec_block_desc),
|
||||
decltype(in_nchw_vec_block_desc.GetLengths()),
|
||||
InBlockCopyDataPerRead>{};
|
||||
|
||||
#if 0
|
||||
constexpr auto blockwise_wei_copy =
|
||||
Blockwise4dTensorCopy1<BlockSize,
|
||||
in_vector_mem_t,
|
||||
decltype(wei_kcyx_vec_global_desc),
|
||||
decltype(wei_kcyx_vec_block_desc),
|
||||
decltype(wei_kcyx_vec_block_desc.GetLengths()),
|
||||
1>{};
|
||||
#elif 1
|
||||
const auto blockwise_wei_copy =
|
||||
Blockwise2dTensorCopy3<BlockSize,
|
||||
in_vector_mem_t,
|
||||
decltype(wei_ke_vec_global_desc),
|
||||
decltype(wei_ke_vec_block_desc),
|
||||
decltype(wei_ke_vec_block_desc.GetLengths()),
|
||||
WeiBlockCopyDataPerRead>{};
|
||||
#endif
|
||||
|
||||
#if 1 // debug
|
||||
// set threadwise output tensor to 0
|
||||
threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
|
||||
#endif
|
||||
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
{
|
||||
// copy input tensor to LDS
|
||||
blockwise_in_copy.Run(p_in_vec_global +
|
||||
in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin,
|
||||
c_block_data_begin,
|
||||
hi_block_data_begin,
|
||||
wi_block_data_begin),
|
||||
p_in_vec_block);
|
||||
|
||||
// copy weight tensor to LDS
|
||||
blockwise_wei_copy.Run(
|
||||
p_wei_vec_global +
|
||||
wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
|
||||
p_wei_vec_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
|
||||
{
|
||||
// threadwise convolution
|
||||
#if 1
|
||||
threadwise_direct_convolution_2(
|
||||
in_nchw_vec_thread_block_desc,
|
||||
p_in_vec_block +
|
||||
in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_kcyx_vec_thread_block_desc,
|
||||
p_wei_vec_block +
|
||||
wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread);
|
||||
#elif 0
|
||||
threadwise_direct_convolution_3(
|
||||
in_nchw_vec_thread_block_desc,
|
||||
p_in_vec_block +
|
||||
in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_kcyx_vec_thread_block_desc,
|
||||
p_wei_vec_block +
|
||||
wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// copy output tensor from register to global mem
|
||||
threadwise_4d_tensor_copy(
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread,
|
||||
out_nkhw_global_desc,
|
||||
p_out_global +
|
||||
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_nkhw_thread_desc.GetLengths());
|
||||
}
|
||||
@@ -0,0 +1,281 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "threadwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_gemm.hip.hpp"
|
||||
|
||||
// define B = flatten(N, Hi, Wi)
|
||||
template <unsigned GridSize,
|
||||
unsigned BlockSize,
|
||||
class Float,
|
||||
class InGlobalDesc,
|
||||
class WeiGlobalDesc,
|
||||
class OutGlobalDesc,
|
||||
unsigned BPerBlock,
|
||||
unsigned KPerBlock,
|
||||
unsigned CPerBlock,
|
||||
unsigned BPerThread,
|
||||
unsigned KPerThread,
|
||||
unsigned GemmThreadPerColumnPerCluster,
|
||||
unsigned GemmThreadPerRowPerCluster,
|
||||
unsigned GemmMPerThreadSubC,
|
||||
unsigned GemmNPerThreadSubC,
|
||||
unsigned GemmMLevel0Cluster,
|
||||
unsigned GemmNLevel0Cluster,
|
||||
unsigned GemmMLevel1Cluster,
|
||||
unsigned GemmNLevel1Cluster,
|
||||
unsigned GemmKPerThreadLoop,
|
||||
unsigned InBlockCopyThreadPerDim0,
|
||||
unsigned InBlockCopyThreadPerDim1,
|
||||
unsigned WeiBlockCopyThreadPerDim0,
|
||||
unsigned WeiBlockCopyThreadPerDim1,
|
||||
unsigned InBlockCopyDataPerRead,
|
||||
unsigned WeiBlockCopyDataPerRead>
|
||||
__global__ void
|
||||
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto in_chwn_global_desc = InGlobalDesc{};
|
||||
constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
|
||||
constexpr auto out_khwn_global_desc = OutGlobalDesc{};
|
||||
|
||||
constexpr unsigned C = in_chwn_global_desc.GetLength(I0);
|
||||
constexpr unsigned Hi = in_chwn_global_desc.GetLength(I1);
|
||||
constexpr unsigned Wi = in_chwn_global_desc.GetLength(I2);
|
||||
constexpr unsigned N = in_chwn_global_desc.GetLength(I3);
|
||||
|
||||
constexpr unsigned K = out_khwn_global_desc.GetLength(I0);
|
||||
constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
|
||||
constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
|
||||
|
||||
constexpr unsigned Y = wei_cyxk_global_desc.GetLength(I1);
|
||||
constexpr unsigned X = wei_cyxk_global_desc.GetLength(I2);
|
||||
|
||||
constexpr unsigned B = N * Hi * Wi;
|
||||
constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1);
|
||||
|
||||
// divide block work by 2d: [K, B]
|
||||
constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
|
||||
constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;
|
||||
|
||||
const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
|
||||
const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;
|
||||
|
||||
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
|
||||
const unsigned b_block_data_begin = b_block_work_id * BPerBlock;
|
||||
|
||||
// flattend (2d) tensor view of gridwise input
|
||||
constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});
|
||||
constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});
|
||||
|
||||
// tensor view of blockwise input and weight
|
||||
// be careful of alignment
|
||||
constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});
|
||||
|
||||
constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock * Y * X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
|
||||
|
||||
constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
|
||||
|
||||
// tensor view of threadwise output in register
|
||||
constexpr auto out_kb_thread_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
{
|
||||
print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
|
||||
print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc");
|
||||
print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
|
||||
|
||||
print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
|
||||
print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
|
||||
|
||||
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
|
||||
print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc");
|
||||
print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
|
||||
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
|
||||
|
||||
printf("KPerBlock %u\n", KPerBlock);
|
||||
}
|
||||
#endif
|
||||
|
||||
// blockwise in copy
|
||||
// formmat is [CPerBlock,BPerBlock + BGhostRead]
|
||||
#if 0
|
||||
const auto blockwise_in_copy =
|
||||
Blockwise2dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(in_cb_global_desc),
|
||||
decltype(in_cb_block_desc),
|
||||
decltype(in_cb_block_desc.GetLengths())>{};
|
||||
#elif 0
|
||||
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
|
||||
Float,
|
||||
decltype(in_cb_global_desc),
|
||||
decltype(in_cb_block_desc),
|
||||
decltype(in_cb_block_desc.GetLengths()),
|
||||
InBlockCopyThreadPerDim0,
|
||||
InBlockCopyThreadPerDim1>{};
|
||||
#elif 1
|
||||
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
|
||||
Float,
|
||||
decltype(in_cb_global_desc),
|
||||
decltype(in_cb_block_desc),
|
||||
decltype(in_cb_block_desc.GetLengths()),
|
||||
InBlockCopyDataPerRead>{};
|
||||
#endif
|
||||
|
||||
// blockwise wei copy
|
||||
// format is [CPerBlock*Y*X,KPerBlock]
|
||||
#if 0
|
||||
const auto blockwise_wei_copy =
|
||||
Blockwise2dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(wei_ek_global_desc),
|
||||
decltype(wei_ek_block_desc),
|
||||
decltype(wei_ek_block_desc.GetLengths())>{};
|
||||
#elif 0
|
||||
const auto blockwise_wei_copy = Blockwise2dTensorCopy2<BlockSize,
|
||||
Float,
|
||||
decltype(wei_ek_global_desc),
|
||||
decltype(wei_ek_block_desc),
|
||||
decltype(wei_ek_block_desc.GetLengths()),
|
||||
WeiBlockCopyThreadPerDim0,
|
||||
WeiBlockCopyThreadPerDim1>{};
|
||||
#elif 1
|
||||
const auto blockwise_wei_copy = Blockwise2dTensorCopy3<BlockSize,
|
||||
Float,
|
||||
decltype(wei_ek_global_desc),
|
||||
decltype(wei_ek_block_desc),
|
||||
decltype(wei_ek_block_desc.GetLengths()),
|
||||
WeiBlockCopyDataPerRead>{};
|
||||
#endif
|
||||
|
||||
// a series of blockwise GEMM
|
||||
// c_mtx += transpose(a_mtx) * b_mtx
|
||||
// a_mtx and b_mtx saved in LDS, c_mtx saved in register
|
||||
// a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K]
|
||||
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
|
||||
// c_mtx[K,B] is out_block[K,B]
|
||||
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_cyxk_block_desc.GetStride(I0)>{});
|
||||
|
||||
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
|
||||
|
||||
constexpr auto c_kxb_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
|
||||
|
||||
const auto blockwise_gemm =
|
||||
BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<BlockSize,
|
||||
decltype(a_cxk_block_mtx_desc),
|
||||
decltype(b_cxb_block_mtx_desc),
|
||||
decltype(c_kxb_thread_mtx_desc),
|
||||
GemmMPerThreadSubC,
|
||||
GemmNPerThreadSubC,
|
||||
GemmMLevel0Cluster,
|
||||
GemmNLevel0Cluster,
|
||||
GemmMLevel1Cluster,
|
||||
GemmNLevel1Cluster,
|
||||
GemmKPerThreadLoop>{};
|
||||
|
||||
// LDS: be careful of alignment
|
||||
constexpr unsigned in_block_size =
|
||||
in_cb_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
|
||||
|
||||
constexpr unsigned wei_block_size =
|
||||
wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
|
||||
|
||||
constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
|
||||
? InBlockCopyDataPerRead
|
||||
: WeiBlockCopyDataPerRead;
|
||||
|
||||
// LDS
|
||||
__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
|
||||
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
|
||||
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
// register
|
||||
Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
|
||||
|
||||
// set threadwise output tensor to 0
|
||||
threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
|
||||
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
|
||||
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0),
|
||||
__syncthreads())
|
||||
{
|
||||
// load data
|
||||
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// compute on current data
|
||||
// a series of GEMM
|
||||
for(unsigned y = 0; y < Y; ++y)
|
||||
{
|
||||
for(unsigned x = 0; x < X; ++x)
|
||||
{
|
||||
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
|
||||
#if 1
|
||||
blockwise_gemm.Run
|
||||
#else
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#endif
|
||||
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block + y * Wi + x,
|
||||
p_out_thread,
|
||||
f_accum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// output: register to global mem,
|
||||
const auto c_thread_mtx_begin =
|
||||
blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
|
||||
|
||||
const unsigned k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
|
||||
const unsigned b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;
|
||||
|
||||
for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
|
||||
{
|
||||
for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
|
||||
{
|
||||
const auto c_thread_mtx_distance =
|
||||
blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b);
|
||||
|
||||
unsigned k_data = k_thread_data_begin + c_thread_mtx_distance.row;
|
||||
unsigned b_data = b_thread_data_begin + c_thread_mtx_distance.col;
|
||||
|
||||
unsigned h_data = b_data / (Wi * N);
|
||||
unsigned itmp = b_data - h_data * (Wi * N);
|
||||
unsigned w_data = itmp / N;
|
||||
unsigned n_data = itmp - w_data * N;
|
||||
|
||||
if(n_data < N && h_data < Ho && w_data < Wo)
|
||||
{
|
||||
p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] =
|
||||
p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -259,7 +259,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
|
||||
__syncthreads();
|
||||
|
||||
// load next data
|
||||
#if 1
|
||||
#if 0
|
||||
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next);
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next);
|
||||
#elif 1
|
||||
@@ -292,7 +292,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next);
|
||||
#endif
|
||||
|
||||
@@ -37,7 +37,8 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re
|
||||
|
||||
// TODO: in order to optimize mem access for different mem type,
|
||||
// need to write specialized version
|
||||
template <class Float,
|
||||
template <class SrcData,
|
||||
class DstData,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class SrcOpLengths,
|
||||
@@ -45,9 +46,9 @@ template <class Float,
|
||||
class F>
|
||||
__device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
|
||||
SrcDesc,
|
||||
const Float* __restrict__ p_src,
|
||||
const SrcData* __restrict__ p_src,
|
||||
DstDesc,
|
||||
Float* __restrict__ p_dst,
|
||||
DstData* __restrict__ p_dst,
|
||||
SrcOpLengths,
|
||||
DstFromSrcReorder,
|
||||
F f)
|
||||
@@ -88,33 +89,38 @@ __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_d
|
||||
}
|
||||
}
|
||||
|
||||
template <class Float, class Desc>
|
||||
__device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p)
|
||||
template <class Data, class Desc>
|
||||
__device__ void threadwise_4d_tensor_set_zero(Desc, Data* __restrict__ p)
|
||||
{
|
||||
auto f_set_zero = [](Float& v) { v = Float(0); };
|
||||
auto f_set_zero = [](Data& v) { v = Data(0); };
|
||||
|
||||
threadwise_4d_tensor_pointwise_operation_unary<Float, Desc, decltype(f_set_zero)>(
|
||||
threadwise_4d_tensor_pointwise_operation_unary<Data, Desc, decltype(f_set_zero)>(
|
||||
Desc{}, p, f_set_zero);
|
||||
}
|
||||
|
||||
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, class DstFromSrcReorder>
|
||||
template <class SrcData,
|
||||
class DstData,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class SrcOpLengths,
|
||||
class DstFromSrcReorder>
|
||||
__device__ void
|
||||
threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
|
||||
const Float* __restrict__ p_src,
|
||||
const SrcData* __restrict__ p_src,
|
||||
DstDesc,
|
||||
Float* __restrict__ p_dst,
|
||||
DstData* __restrict__ p_dst,
|
||||
SrcOpLengths,
|
||||
DstFromSrcReorder)
|
||||
{
|
||||
auto f_copy = [](const Float& src, Float& dst) { dst = src; };
|
||||
auto f_copy = [](const SrcData& src, DstData& dst) { dst = static_cast<DstData>(src); };
|
||||
|
||||
threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
|
||||
SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
|
||||
}
|
||||
|
||||
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
|
||||
template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths>
|
||||
__device__ void threadwise_4d_tensor_copy(
|
||||
SrcDesc, const Float* __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
|
||||
SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths)
|
||||
{
|
||||
auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
|
||||
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
|
||||
// optimized for scenario if p_in, p_wei, p_out are in register
|
||||
template <class Float, class InDesc, class WeiDesc, class OutDesc>
|
||||
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
|
||||
__device__ void threadwise_direct_convolution_1(InDesc,
|
||||
Float* const __restrict__ p_in,
|
||||
TInWei* const __restrict__ p_in,
|
||||
WeiDesc,
|
||||
Float* const __restrict__ p_wei,
|
||||
TInWei* const __restrict__ p_wei,
|
||||
OutDesc,
|
||||
Float* __restrict__ p_out)
|
||||
TOut* __restrict__ p_out)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
@@ -51,25 +51,8 @@ __device__ void threadwise_direct_convolution_1(InDesc,
|
||||
|
||||
const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo);
|
||||
|
||||
p_out[out_index] += p_wei[wei_index] * p_in[in_index];
|
||||
|
||||
#if 0
|
||||
// if(threadIdx.x == 0)
|
||||
{
|
||||
printf("threadwise_direct_convolution: \t"
|
||||
"threadIdx.x %u\t"
|
||||
"out_index %u, p_out[out_index] %f, \t"
|
||||
"wei_index %u, p_wei[wei_index] %f, \t"
|
||||
"in_index %u, p_in[in_index] %f\n",
|
||||
threadIdx.x,
|
||||
out_index,
|
||||
p_out[out_index],
|
||||
wei_index,
|
||||
p_wei[wei_index],
|
||||
in_index,
|
||||
p_in[in_index]);
|
||||
}
|
||||
#endif
|
||||
fused_multiply_accumulate(
|
||||
p_out[out_index], p_wei[wei_index], p_in[in_index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -81,13 +64,13 @@ __device__ void threadwise_direct_convolution_1(InDesc,
|
||||
|
||||
// Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
|
||||
// Copy in and wei into register before doing convolution
|
||||
template <class Float, class InDesc, class WeiDesc, class OutDesc>
|
||||
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
|
||||
__device__ void threadwise_direct_convolution_2(InDesc,
|
||||
Float* const __restrict__ p_in,
|
||||
TInWei* const __restrict__ p_in,
|
||||
WeiDesc,
|
||||
Float* const __restrict__ p_wei,
|
||||
TInWei* const __restrict__ p_wei,
|
||||
OutDesc,
|
||||
Float* __restrict__ p_out)
|
||||
TOut* __restrict__ p_out)
|
||||
{
|
||||
constexpr auto in_desc = InDesc{};
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
@@ -97,8 +80,8 @@ __device__ void threadwise_direct_convolution_2(InDesc,
|
||||
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());
|
||||
|
||||
// register
|
||||
Float p_in_reg[in_reg_desc.GetElementSpace()];
|
||||
Float p_wei_reg[wei_reg_desc.GetElementSpace()];
|
||||
TInWei p_in_reg[in_reg_desc.GetElementSpace()];
|
||||
TInWei p_wei_reg[wei_reg_desc.GetElementSpace()];
|
||||
|
||||
// copy input tensor into register
|
||||
threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc.GetLengths());
|
||||
@@ -114,13 +97,13 @@ __device__ void threadwise_direct_convolution_2(InDesc,
|
||||
// optimized for scenario where p_in and p_wei are in LDS, p_out is in register
|
||||
// break down a non-1x1 convolution into a sequence of 1x1 convolutions,
|
||||
// load 1x1 weight into register, and do 1x1 convolution in register.
|
||||
template <class Float, class InDesc, class WeiDesc, class OutDesc>
|
||||
template <class Data, class InDesc, class WeiDesc, class OutDesc>
|
||||
__device__ void threadwise_direct_convolution_3(InDesc,
|
||||
Float* const __restrict__ p_in,
|
||||
Data* const __restrict__ p_in,
|
||||
WeiDesc,
|
||||
Float* const __restrict__ p_wei,
|
||||
Data* const __restrict__ p_wei,
|
||||
OutDesc,
|
||||
Float* __restrict__ p_out)
|
||||
Data* __restrict__ p_out)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
@@ -139,8 +122,8 @@ __device__ void threadwise_direct_convolution_3(InDesc,
|
||||
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});
|
||||
|
||||
Float p_in_reg[in_reg_desc.GetElementSpace()];
|
||||
Float p_wei_reg[wei_reg_desc.GetElementSpace()];
|
||||
Data p_in_reg[in_reg_desc.GetElementSpace()];
|
||||
Data p_wei_reg[wei_reg_desc.GetElementSpace()];
|
||||
|
||||
constexpr unsigned in_w_new_read = 1;
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ __device__ void threadwise_6d_tensor_copy(SrcDesc,
|
||||
SrcOpLengths,
|
||||
Number<DataPerRead>)
|
||||
{
|
||||
using vector_t = typename vector_type<Float, DataPerRead>::type;
|
||||
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
|
||||
|
||||
static_assert(SrcDesc{}.GetDimension() == 6 && DstDesc{}.GetDimension() == 6 &&
|
||||
SrcOpLengths::nDim == 6,
|
||||
@@ -80,7 +80,7 @@ __device__ void threadwise_8d_tensor_copy(SrcDesc,
|
||||
SrcOpLengths,
|
||||
Number<DataPerRead>)
|
||||
{
|
||||
using vector_t = typename vector_type<Float, DataPerRead>::type;
|
||||
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
|
||||
|
||||
static_assert(SrcDesc{}.GetDimension() == 8 && DstDesc{}.GetDimension() == 8 &&
|
||||
SrcOpLengths::nDim == 8,
|
||||
|
||||
Reference in New Issue
Block a user