mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
@@ -178,7 +178,7 @@ int main()
|
||||
|
||||
for(int i = 0; i < 20; ++i)
|
||||
{
|
||||
device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device);
|
||||
device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device);
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
||||
@@ -55,23 +55,23 @@ void device_direct_convolution_1(
|
||||
cudaEventCreate(&start);
|
||||
cudaEventRecord(start, 0);
|
||||
|
||||
gridwise_convolution<T,
|
||||
InDesc,
|
||||
WeiDesc,
|
||||
OutDesc,
|
||||
OutTileSizeH,
|
||||
OutTileSizeW,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
YPerBlock,
|
||||
XPerBlock,
|
||||
NBlockOpLen0,
|
||||
NBlockOpLen1,
|
||||
NBlockOpLen2,
|
||||
NBlockOpLen3,
|
||||
BlockSize,
|
||||
GridSize>
|
||||
gridwise_direct_convolution_1<T,
|
||||
InDesc,
|
||||
WeiDesc,
|
||||
OutDesc,
|
||||
OutTileSizeH,
|
||||
OutTileSizeW,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
YPerBlock,
|
||||
XPerBlock,
|
||||
NBlockOpLen0,
|
||||
NBlockOpLen1,
|
||||
NBlockOpLen2,
|
||||
NBlockOpLen3,
|
||||
BlockSize,
|
||||
GridSize>
|
||||
<<<grid_dim, block_dim>>>(InDesc{},
|
||||
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
|
||||
WeiDesc{},
|
||||
|
||||
@@ -59,26 +59,26 @@ void device_direct_convolution_2(
|
||||
cudaEventCreate(&start);
|
||||
cudaEventRecord(start, 0);
|
||||
|
||||
gridwise_convolution<T,
|
||||
InDesc,
|
||||
WeiDesc,
|
||||
OutDesc,
|
||||
OutTileSizeH,
|
||||
OutTileSizeW,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
YPerBlock,
|
||||
XPerBlock,
|
||||
NPerThread,
|
||||
KPerThread,
|
||||
CPerThread,
|
||||
NBlockOpLen0,
|
||||
NBlockOpLen1,
|
||||
NBlockOpLen2,
|
||||
NBlockOpLen3,
|
||||
BlockSize,
|
||||
GridSize>
|
||||
gridwise_direct_convolution_2<T,
|
||||
InDesc,
|
||||
WeiDesc,
|
||||
OutDesc,
|
||||
OutTileSizeH,
|
||||
OutTileSizeW,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
YPerBlock,
|
||||
XPerBlock,
|
||||
NPerThread,
|
||||
KPerThread,
|
||||
CPerThread,
|
||||
NBlockOpLen0,
|
||||
NBlockOpLen1,
|
||||
NBlockOpLen2,
|
||||
NBlockOpLen3,
|
||||
BlockSize,
|
||||
GridSize>
|
||||
<<<grid_dim, block_dim>>>(InDesc{},
|
||||
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
|
||||
WeiDesc{},
|
||||
|
||||
@@ -20,12 +20,12 @@ template <class TFloat,
|
||||
unsigned NBlockOpLen3,
|
||||
unsigned BlockSize,
|
||||
unsigned GridSize>
|
||||
__global__ void gridwise_convolution(InGlobalDesc,
|
||||
TFloat* const __restrict__ p_in_global,
|
||||
WeiGlobalDesc,
|
||||
TFloat* const __restrict__ p_wei_global,
|
||||
OutGlobalDesc,
|
||||
TFloat* __restrict__ p_out_global)
|
||||
__global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
TFloat* const __restrict__ p_in_global,
|
||||
WeiGlobalDesc,
|
||||
TFloat* const __restrict__ p_wei_global,
|
||||
OutGlobalDesc,
|
||||
TFloat* __restrict__ p_out_global)
|
||||
{
|
||||
constexpr auto I0 = Index<0>{};
|
||||
constexpr auto I1 = Index<1>{};
|
||||
|
||||
@@ -25,12 +25,12 @@ template <class TFloat,
|
||||
unsigned NBlockOpLen3,
|
||||
unsigned BlockSize,
|
||||
unsigned GridSize>
|
||||
__global__ void gridwise_convolution(InGlobalDesc,
|
||||
TFloat* const __restrict__ p_in_global,
|
||||
WeiGlobalDesc,
|
||||
TFloat* const __restrict__ p_wei_global,
|
||||
OutGlobalDesc,
|
||||
TFloat* __restrict__ p_out_global)
|
||||
__global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
TFloat* const __restrict__ p_in_global,
|
||||
WeiGlobalDesc,
|
||||
TFloat* const __restrict__ p_wei_global,
|
||||
OutGlobalDesc,
|
||||
TFloat* __restrict__ p_out_global)
|
||||
{
|
||||
constexpr auto I0 = Index<0>{};
|
||||
constexpr auto I1 = Index<1>{};
|
||||
|
||||
Reference in New Issue
Block a user