Mirror of https://github.com/ROCm/composable_kernel.git, synced 2026-05-14 10:09:41 +00:00
@@ -4,7 +4,7 @@
 #include "threadwise_direct_convolution.cuh"

 template <unsigned BlockSize,
-          class TFloat,
+          class Float,
           class InBlockDesc,
           class WeiBlockDesc,
           class OutBlockDesc,
@@ -14,11 +14,11 @@ template <unsigned BlockSize,
           unsigned KPerThread,
           unsigned CPerThread>
 __device__ void blockwise_direct_convolution(InBlockDesc,
-                                             TFloat* const __restrict__ p_in_block,
+                                             Float* const __restrict__ p_in_block,
                                              WeiBlockDesc,
-                                             TFloat* const __restrict__ p_wei_block,
+                                             Float* const __restrict__ p_wei_block,
                                              OutBlockDesc,
-                                             TFloat* __restrict__ p_out_block)
+                                             Float* __restrict__ p_out_block)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -92,7 +92,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
     unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding
     unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding

-    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
+    Float p_out_thread[out_thread_desc.GetElementSpace()];

     threadwise_4d_tensor_copy(out_block_desc,
                               p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
@@ -1,9 +1,9 @@
 #pragma once
 #include "constant_tensor_descriptor.cuh"

-template <unsigned BlockSize, class TFloat, class DstDesc, class F>
+template <unsigned BlockSize, class Float, class DstDesc, class F>
 __device__ void
-blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
+blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -79,7 +79,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
 // TODO: in order to optimize mem access for different mem type,
 // need to write specialized version
 template <unsigned BlockSize,
-          class TFloat,
+          class Float,
           class SrcDesc,
           class DstDesc,
           class RefDesc,
@@ -87,9 +87,9 @@ template <unsigned BlockSize,
           class F>
 __device__ void
 blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
-                                                       TFloat* const __restrict__ p_src,
+                                                       Float* const __restrict__ p_src,
                                                        DstDesc,
-                                                       TFloat* __restrict__ p_dst,
+                                                       Float* __restrict__ p_dst,
                                                        RefDesc,
                                                        Reorder,
                                                        F f)
@@ -170,36 +170,32 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
     }
 }

-template <unsigned BlockSize, class TFloat, class DstDesc>
-__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst)
+template <unsigned BlockSize, class Float, class DstDesc>
+__device__ void blockwise_4d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
 {
-    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
+    auto f_set_zero = [](Float& v) { v = Float(0); };

     blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
 }

 template <unsigned BlockSize,
-          class TFloat,
+          class Float,
           class SrcDesc,
           class DstDesc,
           class RefDesc,
           class Reorder>
-__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
-                                                 TFloat* const __restrict__ p_src,
-                                                 DstDesc,
-                                                 TFloat* __restrict__ p_dst,
-                                                 RefDesc,
-                                                 Reorder)
+__device__ void blockwise_4d_tensor_copy_reorder(
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
 {
-    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
+    auto f_copy = [](const Float& src, Float& dst) { dst = src; };

     blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>(
         SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
 }

-template <unsigned BlockSize, class TFloat, class SrcDesc, class DstDesc, class RefDesc>
+template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class RefDesc>
 __device__ void blockwise_4d_tensor_copy(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
 {
     constexpr auto reorder = Sequence<0, 1, 2, 3>{};

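The blockwise pointwise routines above distribute one flat element range over the threads of a block and apply a functor elementwise; blockwise_4d_tensor_set_zero is the unary case with a zeroing lambda. Below is a minimal self-contained sketch of that pattern, using a plain 1-D buffer instead of the repo's ConstantTensorDescriptor; blockwise_pointwise_unary_1d and set_zero_kernel are made-up names for illustration, not the repo's API.

#include <cuda_runtime.h>

// Hypothetical sketch: each of the BlockSize threads strides through the
// buffer with step BlockSize and applies the unary functor in place.
template <unsigned BlockSize, class Float, class F>
__device__ void blockwise_pointwise_unary_1d(Float* __restrict__ p, unsigned n, F f)
{
    for(unsigned i = threadIdx.x; i < n; i += BlockSize)
        f(p[i]);
}

template <unsigned BlockSize, class Float>
__global__ void set_zero_kernel(Float* p, unsigned n)
{
    // same shape as the repo's f_set_zero lambda
    auto f_set_zero = [](Float& v) { v = Float(0); };
    blockwise_pointwise_unary_1d<BlockSize>(p, n, f_set_zero);
}

int main()
{
    float* p = nullptr;
    cudaMalloc(&p, 256 * sizeof(float));
    set_zero_kernel<64, float><<<1, 64>>>(p, 256);
    cudaDeviceSynchronize();
    cudaFree(p);
    return 0;
}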
@@ -3,7 +3,7 @@
 #include "blockwise_tensor_op.cuh"
 #include "blockwise_direct_convolution.cuh"

-template <class TFloat,
+template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
@@ -20,11 +20,11 @@ template <class TFloat,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_1(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
+                                              Float* const __restrict__ p_in_global,
                                               WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
+                                              Float* const __restrict__ p_wei_global,
                                               OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
+                                              Float* __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -68,9 +68,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
     constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
     constexpr unsigned out_block_size = out_block_desc.GetElementSpace();

-    __shared__ TFloat p_in_block[in_block_size];
-    __shared__ TFloat p_wei_block[wei_block_size];
-    __shared__ TFloat p_out_block[out_block_size];
+    __shared__ Float p_in_block[in_block_size];
+    __shared__ Float p_wei_block[wei_block_size];
+    __shared__ Float p_out_block[out_block_size];

     const unsigned block_id = blockIdx.x;

@@ -150,7 +150,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,

     // blockwise convolution
     blockwise_direct_convolution<BlockSize,
-                                 TFloat,
+                                 Float,
                                  decltype(in_block_desc),
                                  decltype(wei_block_desc),
                                  decltype(out_block_desc),
@@ -5,7 +5,7 @@
 #include "threadwise_tensor_op.cuh"
 #include "threadwise_direct_convolution.cuh"

-template <class TFloat,
+template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
@@ -22,11 +22,11 @@ template <class TFloat,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_2(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
+                                              Float* const __restrict__ p_in_global,
                                               WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
+                                              Float* const __restrict__ p_wei_global,
                                               OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
+                                              Float* __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -56,8 +56,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     constexpr unsigned in_block_size = in_block_desc.GetElementSpace();
     constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();

-    __shared__ TFloat p_in_block[in_block_size];
-    __shared__ TFloat p_wei_block[wei_block_size];
+    __shared__ Float p_in_block[in_block_size];
+    __shared__ Float p_wei_block[wei_block_size];

     // threadwise tensors
     constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
@@ -73,7 +73,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);

     // register
-    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
+    Float p_out_thread[out_thread_desc.GetElementSpace()];

     // divide block work
     constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
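Both gridwise kernels above follow the same staging discipline: tiles of input and weights are copied from global memory into __shared__ (LDS) buffers sized from the block descriptors, threads synchronize, and each thread then accumulates its output in registers. A minimal self-contained sketch of that discipline follows, with a dot product standing in for the blockwise convolution; staged_multiply_accumulate and TileSize are made-up names, not the repo's kernels.

#include <cuda_runtime.h>

template <unsigned BlockSize, unsigned TileSize, class Float>
__global__ void staged_multiply_accumulate(Float const* __restrict__ p_in_global,
                                           Float const* __restrict__ p_wei_global,
                                           Float* __restrict__ p_out_global)
{
    // stage tiles in LDS, as the gridwise kernels do with p_in_block/p_wei_block
    __shared__ Float p_in_block[TileSize];
    __shared__ Float p_wei_block[TileSize];

    // cooperative copy: global -> LDS, one strided pass over the tile
    for(unsigned i = threadIdx.x; i < TileSize; i += BlockSize)
    {
        p_in_block[i]  = p_in_global[blockIdx.x * TileSize + i];
        p_wei_block[i] = p_wei_global[blockIdx.x * TileSize + i];
    }

    __syncthreads();

    // per-thread work reads LDS and accumulates in a register
    Float acc = Float(0);
    for(unsigned i = 0; i < TileSize; ++i)
        acc += p_in_block[i] * p_wei_block[i];

    if(threadIdx.x == 0)
        p_out_global[blockIdx.x] = acc;
}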
@@ -3,7 +3,7 @@
 #include "blockwise_winograd_transform.cuh"
 #include "threadwise_winograd_transform.cuh"

-template <class TFloat,
+template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
@@ -20,11 +20,11 @@ template <class TFloat,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_winograd_convolution(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
+                                              Float* const __restrict__ p_in_global,
                                               WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
+                                              Float* const __restrict__ p_wei_global,
                                               OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
+                                              Float* __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -102,8 +102,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
     constexpr auto wei_transform_block_desc =
         make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, InTileSizeH, InTileSizeW>{});

-    __shared__ TFloat p_in_transform_block[in_transform_block_desc.GetElementSpace()];
-    __shared__ TFloat p_wei_transform_block[wei_transform_block_desc.GetElementSpace()];
+    __shared__ Float p_in_transform_block[in_transform_block_desc.GetElementSpace()];
+    __shared__ Float p_wei_transform_block[wei_transform_block_desc.GetElementSpace()];

     // thread data
     constexpr auto in_transform_thread_block_desc =
@@ -123,8 +123,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
     constexpr auto out_thread_global_desc =
         make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_global_desc.GetStrides());

-    TFloat p_out_transform_thread[out_transform_thread_desc.GetElementSpace()];
-    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
+    Float p_out_transform_thread[out_transform_thread_desc.GetElementSpace()];
+    Float p_out_thread[out_thread_desc.GetElementSpace()];

 #if 0
     if(blockIdx.x == 0 && threadIdx.x == 0)
@@ -146,7 +146,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
     {
 #if 0
         // blockwise transform input
-        blockwise_winograd_transform_input<TFloat,
+        blockwise_winograd_transform_input<Float,
                                            InTileSizeH,
                                            InTileSizeW,
                                            S,
@@ -166,7 +166,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,

 #endif
         // blockwise transform weights
-        blockwise_winograd_transform_weight<TFloat,
+        blockwise_winograd_transform_weight<Float,
                                             InTileSizeH,
                                             InTileSizeW,
                                             S,
@@ -183,7 +183,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
         {
             // threadwise point multiplication
             threadwise_winograd_calculate_transformed_output<
-                TFloat,
+                Float,
                 decltype(in_transform_thread_block_desc),
                 decltype(wei_transform_thread_block_desc),
                 decltype(out_transform_thread_desc),
@@ -207,7 +207,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
         };

         // transform back
-        threadwise_winograd_reverse_transform_output<TFloat,
+        threadwise_winograd_reverse_transform_output<Float,
                                                      decltype(out_transform_thread_desc),
                                                      decltype(out_thread_desc),
                                                      InTileSizeH,
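For context on the Winograd kernel above: once input tiles and weights have been transformed, the remaining per-tile work is an elementwise (Hadamard) product in the transform domain, accumulated over input channels, followed by the reverse transform back to the output tile. A stand-alone sketch of just the point-multiplication step, assuming a row-major [C][TileH][TileW] layout instead of the repo's descriptor-driven layout; pointwise_multiply_transformed is a hypothetical name.

// Sum over channels of the elementwise product of the transformed
// input tile and the transformed weight tile.
template <class Float, unsigned C, unsigned TileH, unsigned TileW>
__device__ void
pointwise_multiply_transformed(Float const* __restrict__ p_in_transform,  // [C][TileH][TileW]
                               Float const* __restrict__ p_wei_transform, // [C][TileH][TileW]
                               Float* __restrict__ p_out_transform)       // [TileH][TileW]
{
    for(unsigned h = 0; h < TileH; ++h)
        for(unsigned w = 0; w < TileW; ++w)
        {
            Float acc = Float(0);

            for(unsigned c = 0; c < C; ++c)
                acc += p_in_transform[(c * TileH + h) * TileW + w] *
                       p_wei_transform[(c * TileH + h) * TileW + w];

            p_out_transform[h * TileW + w] = acc;
        }
}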
@@ -2,13 +2,13 @@
 #include "constant_tensor_descriptor.cuh"

 // optimized for scenario if p_in, p_wei, p_out are in register
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_1(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                 WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                 OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -81,13 +81,13 @@ __device__ void threadwise_direct_convolution_1(InDesc,

 // Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
 // Copy in and wei into register before doing convolution
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_2(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                 WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                 OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
     constexpr auto in_desc = InDesc{};
     constexpr auto wei_desc = WeiDesc{};
@@ -97,8 +97,8 @@ __device__ void threadwise_direct_convolution_2(InDesc,
     constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());

     // register
-    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
-    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
+    Float p_in_reg[in_reg_desc.GetElementSpace()];
+    Float p_wei_reg[wei_reg_desc.GetElementSpace()];

     // copy input tensor into register
     threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc);
@@ -114,13 +114,13 @@ __device__ void threadwise_direct_convolution_2(InDesc,
 // optimized for scenario where p_in and p_wei are in LDS, p_out is in register
 // break down a non-1x1 convolution into a sequence of 1x1 convolutions,
 // load 1x1 weight into register, and do 1x1 convolution in register.
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_3(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                 WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                 OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -139,8 +139,8 @@ __device__ void threadwise_direct_convolution_3(InDesc,
     constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
         Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});

-    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
-    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
+    Float p_in_reg[in_reg_desc.GetElementSpace()];
+    Float p_wei_reg[wei_reg_desc.GetElementSpace()];

     constexpr unsigned in_w_new_read = 1;

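The three threadwise variants above differ only in where their operands live (register vs. LDS) and how they are staged; the arithmetic each one performs is the plain direct convolution below. A hedged reference sketch with hard-coded row-major layouts in place of the repo's tensor descriptors; naive_threadwise_convolution and the compile-time sizes are made-up for illustration.

// out[k][ho][wo] += sum over c, s, r of wei[k][c][s][r] * in[c][ho+s][wo+r]
template <class Float,
          unsigned K, unsigned C, unsigned S, unsigned R,
          unsigned Ho, unsigned Wo>
__device__ void naive_threadwise_convolution(Float const* __restrict__ p_in,  // [C][Ho+S-1][Wo+R-1]
                                             Float const* __restrict__ p_wei, // [K][C][S][R]
                                             Float* __restrict__ p_out)       // [K][Ho][Wo]
{
    constexpr unsigned Hi = Ho + S - 1;
    constexpr unsigned Wi = Wo + R - 1;

    for(unsigned k = 0; k < K; ++k)
        for(unsigned ho = 0; ho < Ho; ++ho)
            for(unsigned wo = 0; wo < Wo; ++wo)
            {
                // accumulate on top of the existing output value,
                // as the descriptor-based versions do
                Float acc = p_out[(k * Ho + ho) * Wo + wo];

                for(unsigned c = 0; c < C; ++c)
                    for(unsigned s = 0; s < S; ++s)
                        for(unsigned r = 0; r < R; ++r)
                            acc += p_wei[((k * C + c) * S + s) * R + r] *
                                   p_in[(c * Hi + ho + s) * Wi + wo + r];

                p_out[(k * Ho + ho) * Wo + wo] = acc;
            }
}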
@@ -1,8 +1,8 @@
 #pragma once
 #include "constant_tensor_descriptor.cuh"

-template <class TFloat, class Desc, class F>
-__device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __restrict__ p, F f)
+template <class Float, class Desc, class F>
+__device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __restrict__ p, F f)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -37,12 +37,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __r

 // TODO: in order to optimize mem access for different mem type,
 // need to write specialized version
-template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, class Reorder, class F>
+template <class Float, class SrcDesc, class DstDesc, class RefDesc, class Reorder, class F>
 __device__ void
 threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
-                                                        TFloat* const __restrict__ p_src,
+                                                        Float* const __restrict__ p_src,
                                                         DstDesc,
-                                                        TFloat* __restrict__ p_dst,
+                                                        Float* __restrict__ p_dst,
                                                         RefDesc,
                                                         Reorder,
                                                         F f)
@@ -83,26 +83,22 @@ threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
     }
 }

-template <class TFloat, class Desc>
-__device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p)
+template <class Float, class Desc>
+__device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p)
 {
-    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
+    auto f_set_zero = [](Float& v) { v = Float(0); };

-    threadwise_4d_tensor_pointwise_operation_unary<TFloat, Desc, decltype(f_set_zero)>(
+    threadwise_4d_tensor_pointwise_operation_unary<Float, Desc, decltype(f_set_zero)>(
         Desc{}, p, f_set_zero);
 }

-template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, class Reorder>
-__device__ void threadwise_4d_tensor_copy_reorder(SrcDesc,
-                                                  TFloat* const __restrict__ p_src,
-                                                  DstDesc,
-                                                  TFloat* __restrict__ p_dst,
-                                                  RefDesc,
-                                                  Reorder)
+template <class Float, class SrcDesc, class DstDesc, class RefDesc, class Reorder>
+__device__ void threadwise_4d_tensor_copy_reorder(
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
 {
-    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
+    auto f_copy = [](const Float& src, Float& dst) { dst = src; };

-    threadwise_4d_tensor_pointwise_operation_binary_reorder<TFloat,
+    threadwise_4d_tensor_pointwise_operation_binary_reorder<Float,
                                                             SrcDesc,
                                                             DstDesc,
                                                             RefDesc,
@@ -111,18 +107,18 @@ __device__ void threadwise_4d_tensor_copy_reorder(SrcDesc,
         SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
 }

-template <class TFloat, class SrcDesc, class DstDesc, class RefDesc>
+template <class Float, class SrcDesc, class DstDesc, class RefDesc>
 __device__ void threadwise_4d_tensor_copy(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
 {
     auto reorder = Sequence<0, 1, 2, 3>{};

-    threadwise_4d_tensor_copy_reorder<TFloat, SrcDesc, DstDesc, RefDesc, decltype(reorder)>(
+    threadwise_4d_tensor_copy_reorder<Float, SrcDesc, DstDesc, RefDesc, decltype(reorder)>(
         SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
 }

-template <class TFloat, class Desc, class IDim, class NShift>
-__device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, IDim, NShift)
+template <class Float, class Desc, class IDim, class NShift>
+__device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
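The copy-with-reorder routines above pair a source tensor with a destination whose dimension order is a compile-time permutation (the Sequence passed as Reorder); the plain copy is just the identity permutation Sequence<0, 1, 2, 3>. Below is a self-contained 4-D sketch of that index mapping under one assumed convention (destination dimension d takes source dimension Md); copy_4d_reorder is a hypothetical name, and the repo's actual Reorder convention may differ.

template <class Float,
          unsigned L0, unsigned L1, unsigned L2, unsigned L3, // src lengths
          unsigned M0, unsigned M1, unsigned M2, unsigned M3> // dst dim d reads src dim Md
__device__ void copy_4d_reorder(Float const* __restrict__ p_src, Float* __restrict__ p_dst)
{
    constexpr unsigned src_len[4] = {L0, L1, L2, L3};
    constexpr unsigned map[4]     = {M0, M1, M2, M3};

    // dst lengths are the src lengths permuted by the reorder
    constexpr unsigned dst_len[4] = {
        src_len[map[0]], src_len[map[1]], src_len[map[2]], src_len[map[3]]};

    for(unsigned i0 = 0; i0 < L0; ++i0)
        for(unsigned i1 = 0; i1 < L1; ++i1)
            for(unsigned i2 = 0; i2 < L2; ++i2)
                for(unsigned i3 = 0; i3 < L3; ++i3)
                {
                    const unsigned s[4] = {i0, i1, i2, i3};

                    const unsigned src_idx = ((i0 * L1 + i1) * L2 + i2) * L3 + i3;
                    const unsigned dst_idx =
                        ((s[map[0]] * dst_len[1] + s[map[1]]) * dst_len[2] + s[map[2]]) *
                            dst_len[3] +
                        s[map[3]];

                    p_dst[dst_idx] = p_src[src_idx];
                }
}

With the identity permutation the mapping degenerates to a straight packed copy, mirroring how threadwise_4d_tensor_copy forwards Sequence<0, 1, 2, 3> to the reorder version.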