mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 08:50:17 +00:00
fixed a bug, and refactored
This commit is contained in:
@@ -8,7 +8,7 @@
|
||||
#include "conv_common.cuh"
|
||||
#include "device_direct_convolution_1.cuh"
|
||||
#include "device_direct_convolution_2.cuh"
|
||||
//#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
|
||||
#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
|
||||
#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
|
||||
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
|
||||
#include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
|
||||
@@ -361,7 +361,7 @@ int main()
|
||||
constexpr unsigned K = 1;
|
||||
constexpr unsigned S = 3;
|
||||
constexpr unsigned R = 3;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// 3x3, 34x34
|
||||
constexpr unsigned N = 64;
|
||||
constexpr unsigned C = 256;
|
||||
@@ -370,7 +370,7 @@ int main()
|
||||
constexpr unsigned K = 64;
|
||||
constexpr unsigned S = 3;
|
||||
constexpr unsigned R = 3;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// 3x3, 54x54
|
||||
constexpr unsigned N = 64;
|
||||
constexpr unsigned C = 64;
|
||||
@@ -388,7 +388,7 @@ int main()
|
||||
constexpr unsigned K = 64;
|
||||
constexpr unsigned S = 3;
|
||||
constexpr unsigned R = 3;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// 3x3, 58x58
|
||||
constexpr unsigned N = 64;
|
||||
constexpr unsigned C = 64;
|
||||
@@ -449,7 +449,7 @@ int main()
|
||||
device_direct_convolution_2
|
||||
#elif 0
|
||||
device_implicit_gemm_convolution_1_nchw_kcsr
|
||||
#elif 0
|
||||
#elif 1
|
||||
device_implicit_gemm_convolution_1_nchw_srck_nkhw
|
||||
#elif 1
|
||||
device_implicit_gemm_convolution_1_chwn_csrk_khwn
|
||||
|
||||
@@ -1,9 +1,15 @@
|
||||
#pragma once
|
||||
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh"
|
||||
#include <unistd.h>
|
||||
|
||||
template <class T, class InDesc, class WeiDesc, class OutDesc>
|
||||
void device_implicit_gemm_convolution_1_nchw_kcsr(
|
||||
InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
|
||||
void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
|
||||
const Tensor<T>& in,
|
||||
WeiDesc,
|
||||
const Tensor<T>& wei,
|
||||
OutDesc,
|
||||
Tensor<T>& out,
|
||||
unsigned nrepeat)
|
||||
{
|
||||
std::size_t data_sz = sizeof(T);
|
||||
DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
|
||||
@@ -75,40 +81,45 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(
|
||||
|
||||
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
|
||||
|
||||
cudaEvent_t start, stop;
|
||||
float elapsedTime;
|
||||
for(unsigned i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
float elapsedTime;
|
||||
|
||||
cudaEventCreate(&start);
|
||||
cudaEventRecord(start, 0);
|
||||
cudaEventCreate(&start);
|
||||
cudaEventRecord(start, 0);
|
||||
|
||||
gridwise_implicit_gemm_convolution_1_nchw_kcsr<GridSize,
|
||||
BlockSize,
|
||||
T,
|
||||
InDesc,
|
||||
WeiDesc,
|
||||
OutDesc,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
KPerThread,
|
||||
CPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread>
|
||||
<<<grid_dim, block_dim>>>(InDesc{},
|
||||
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
|
||||
WeiDesc{},
|
||||
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
|
||||
OutDesc{},
|
||||
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
|
||||
gridwise_implicit_gemm_convolution_1_nchw_kcsr<GridSize,
|
||||
BlockSize,
|
||||
T,
|
||||
InDesc,
|
||||
WeiDesc,
|
||||
OutDesc,
|
||||
NPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
KPerThread,
|
||||
CPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread>
|
||||
<<<grid_dim, block_dim>>>(InDesc{},
|
||||
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
|
||||
WeiDesc{},
|
||||
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
|
||||
OutDesc{},
|
||||
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
|
||||
|
||||
cudaEventCreate(&stop);
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventCreate(&stop);
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
|
||||
cudaEventElapsedTime(&elapsedTime, start, stop);
|
||||
printf("Elapsed time : %f ms\n", elapsedTime);
|
||||
cudaEventElapsedTime(&elapsedTime, start, stop);
|
||||
printf("Elapsed time : %f ms\n", elapsedTime);
|
||||
|
||||
usleep(10000);
|
||||
}
|
||||
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
out_device_buf.FromDevice(out.mData.data());
|
||||
|
||||
@@ -65,20 +65,8 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
|
||||
constexpr unsigned WoPerThread = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 16;
|
||||
#elif 0
|
||||
constexpr unsigned NPerBlock = 1;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
constexpr unsigned CPerBlock = 2;
|
||||
constexpr unsigned HoPerBlock = 4;
|
||||
constexpr unsigned WoPerBlock = 32;
|
||||
|
||||
constexpr unsigned KPerThread = 8;
|
||||
constexpr unsigned CPerThread = 1;
|
||||
constexpr unsigned HoPerThread = 2;
|
||||
constexpr unsigned WoPerThread = 4;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 1
|
||||
// for 3x3, 34x34
|
||||
constexpr unsigned NPerBlock = 1;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
constexpr unsigned CPerBlock = 2;
|
||||
@@ -92,6 +80,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 0
|
||||
// for 3x3, 34x34
|
||||
constexpr unsigned NPerBlock = 2;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
constexpr unsigned CPerBlock = 2;
|
||||
|
||||
@@ -104,7 +104,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
|
||||
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
|
||||
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
|
||||
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
|
||||
const unsigned wo_block_data_begin = w_block_work_id * HoPerBlock;
|
||||
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
|
||||
|
||||
const unsigned hi_block_data_begin = ho_block_data_begin;
|
||||
const unsigned wi_block_data_begin = wo_block_data_begin;
|
||||
|
||||
@@ -70,7 +70,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
|
||||
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
|
||||
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
|
||||
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
|
||||
const unsigned wo_block_data_begin = w_block_work_id * HoPerBlock;
|
||||
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
|
||||
|
||||
const unsigned hi_block_data_begin = ho_block_data_begin;
|
||||
const unsigned wi_block_data_begin = wo_block_data_begin;
|
||||
@@ -162,7 +162,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < in_nchw_global_desc.GetLength(I1);
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
// input: global mem to LDS,
|
||||
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
|
||||
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
|
||||
@@ -177,7 +177,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
|
||||
reorder_chwn_from_nchw);
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
// weight: global mem to LDS,
|
||||
// format is [S,R,C,K], no conversion needed
|
||||
blockwise_wei_copy.run(p_wei_global + wei_srck_global_desc.Get1dIndex(
|
||||
|
||||
Reference in New Issue
Block a user