mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 03:19:48 +00:00
@@ -2,13 +2,13 @@
|
||||
#include "constant_tensor_descriptor.cuh"
|
||||
|
||||
// optimized for scenario where p_in, p_wei and p_out are all in register
|
||||
template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
|
||||
template <class Float, class InDesc, class WeiDesc, class OutDesc>
|
||||
__device__ void threadwise_direct_convolution_1(InDesc,
|
||||
TFloat* const __restrict__ p_in,
|
||||
Float* const __restrict__ p_in,
|
||||
WeiDesc,
|
||||
TFloat* const __restrict__ p_wei,
|
||||
Float* const __restrict__ p_wei,
|
||||
OutDesc,
|
||||
TFloat* __restrict__ p_out)
|
||||
Float* __restrict__ p_out)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
@@ -81,13 +81,13 @@ __device__ void threadwise_direct_convolution_1(InDesc,
|
||||
|
||||
// Optimized for scenario where p_in and p_wei are in LDS, p_out is in register
|
||||
// Copy in and wei into register before doing convolution
|
||||
template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
|
||||
template <class Float, class InDesc, class WeiDesc, class OutDesc>
|
||||
__device__ void threadwise_direct_convolution_2(InDesc,
|
||||
TFloat* const __restrict__ p_in,
|
||||
Float* const __restrict__ p_in,
|
||||
WeiDesc,
|
||||
TFloat* const __restrict__ p_wei,
|
||||
Float* const __restrict__ p_wei,
|
||||
OutDesc,
|
||||
TFloat* __restrict__ p_out)
|
||||
Float* __restrict__ p_out)
|
||||
{
|
||||
constexpr auto in_desc = InDesc{};
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
@@ -97,8 +97,8 @@ __device__ void threadwise_direct_convolution_2(InDesc,
|
||||
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());
|
||||
|
||||
// register
|
||||
TFloat p_in_reg[in_reg_desc.GetElementSpace()];
|
||||
TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
|
||||
Float p_in_reg[in_reg_desc.GetElementSpace()];
|
||||
Float p_wei_reg[wei_reg_desc.GetElementSpace()];
|
||||
|
||||
// copy input tensor into register
|
||||
threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc);
|
||||
@@ -114,13 +114,13 @@ __device__ void threadwise_direct_convolution_2(InDesc,
|
||||
// optimized for scenario where p_in and p_wei are in LDS, p_out is in register
|
||||
// break down a non-1x1 convolution into a sequence of 1x1 convolutions,
|
||||
// load 1x1 weight into register, and do 1x1 convolution in register.
|
||||
template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
|
||||
template <class Float, class InDesc, class WeiDesc, class OutDesc>
|
||||
__device__ void threadwise_direct_convolution_3(InDesc,
|
||||
TFloat* const __restrict__ p_in,
|
||||
Float* const __restrict__ p_in,
|
||||
WeiDesc,
|
||||
TFloat* const __restrict__ p_wei,
|
||||
Float* const __restrict__ p_wei,
|
||||
OutDesc,
|
||||
TFloat* __restrict__ p_out)
|
||||
Float* __restrict__ p_out)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
@@ -139,8 +139,8 @@ __device__ void threadwise_direct_convolution_3(InDesc,
|
||||
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});
|
||||
|
||||
TFloat p_in_reg[in_reg_desc.GetElementSpace()];
|
||||
TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
|
||||
Float p_in_reg[in_reg_desc.GetElementSpace()];
|
||||
Float p_wei_reg[wei_reg_desc.GetElementSpace()];
|
||||
|
||||
constexpr unsigned in_w_new_read = 1;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user