fixed a bug, and refactored

[ROCm/composable_kernel commit: 1de6fd0753]
This commit is contained in:
Chao Liu
2019-01-24 21:20:29 -06:00
parent 29e74d62c3
commit 41a0b0b4bb
5 changed files with 54 additions and 54 deletions

View File

@@ -8,7 +8,7 @@
#include "conv_common.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh"
//#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
#include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
@@ -361,7 +361,7 @@ int main()
constexpr unsigned K = 1;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 0
#elif 1
// 3x3, 34x34
constexpr unsigned N = 64;
constexpr unsigned C = 256;
@@ -370,7 +370,7 @@ int main()
constexpr unsigned K = 64;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 1
#elif 0
// 3x3, 54x54
constexpr unsigned N = 64;
constexpr unsigned C = 64;
@@ -388,7 +388,7 @@ int main()
constexpr unsigned K = 64;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 1
#elif 0
// 3x3, 58x58
constexpr unsigned N = 64;
constexpr unsigned C = 64;
@@ -449,7 +449,7 @@ int main()
device_direct_convolution_2
#elif 0
device_implicit_gemm_convolution_1_nchw_kcsr
#elif 0
#elif 1
device_implicit_gemm_convolution_1_nchw_srck_nkhw
#elif 1
device_implicit_gemm_convolution_1_chwn_csrk_khwn

View File

@@ -1,9 +1,15 @@
#pragma once
#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh"
#include <unistd.h>
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_1_nchw_kcsr(
InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
const Tensor<T>& in,
WeiDesc,
const Tensor<T>& wei,
OutDesc,
Tensor<T>& out,
unsigned nrepeat)
{
std::size_t data_sz = sizeof(T);
DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
@@ -75,40 +81,45 @@ void device_implicit_gemm_convolution_1_nchw_kcsr(
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
cudaEvent_t start, stop;
float elapsedTime;
for(unsigned i = 0; i < nrepeat; ++i)
{
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
cudaEventCreate(&start);
cudaEventRecord(start, 0);
gridwise_implicit_gemm_convolution_1_nchw_kcsr<GridSize,
BlockSize,
T,
InDesc,
WeiDesc,
OutDesc,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread>
<<<grid_dim, block_dim>>>(InDesc{},
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
WeiDesc{},
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
OutDesc{},
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
gridwise_implicit_gemm_convolution_1_nchw_kcsr<GridSize,
BlockSize,
T,
InDesc,
WeiDesc,
OutDesc,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread>
<<<grid_dim, block_dim>>>(InDesc{},
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
WeiDesc{},
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
OutDesc{},
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
usleep(10000);
}
checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data());

View File

@@ -65,20 +65,8 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
constexpr unsigned WoPerThread = 2;
constexpr unsigned BlockSize = 16;
#elif 0
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2;
constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned KPerThread = 8;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 4;
constexpr unsigned BlockSize = 128;
#elif 1
// for 3x3, 34x34
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2;
@@ -92,6 +80,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
constexpr unsigned BlockSize = 128;
#elif 0
// for 3x3, 34x34
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2;