conv: update tensorDesc calculation

[ROCm/composable_kernel commit: 6a45afba95]
This commit is contained in:
Chao Liu
2018-11-04 04:55:37 -06:00
parent 621850a671
commit 1545a3f6f9
2 changed files with 125 additions and 55 deletions

View File

@@ -111,9 +111,10 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
f_par(std::thread::hardware_concurrency());
}
template <class T, class InDesc, class WeiDesc, class OutDesc>
#if 0
template <class T>
void device_convolution(
InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
{
DeviceTensorDescriptor<4> in_desc_device(in.mDesc);
DeviceTensorDescriptor<4> wei_desc_device(wei.mDesc);
@@ -144,7 +145,7 @@ void device_convolution(
dim3 block_dim(64, 1, 1);
dim3 grid_dim(1, 1, 1);
#if 0
gridwise_convolution<T, 3, 3, 4, 4, 2, 2, 1, 1, 8, 8, 1>
<<<grid_dim, block_dim>>>(in_desc_device,
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
@@ -152,19 +153,78 @@ void device_convolution(
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
out_desc_device,
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data());
}
#else
gridwise_convolution<T, InDesc, WeiDesc, OutDesc, 4, 4, 2, 2, 1, 1, 8, 8, 1>
template <class T, class InDesc, class WeiDesc, class OutDesc>
void const_device_convolution(
InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
{
std::size_t data_sz = sizeof(T);
DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
int num_thread = std::thread::hardware_concurrency();
out.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
out_device_buf.ToDevice(out.mData.data());
dim3 block_dim(64, 1, 1);
dim3 grid_dim(1, 1, 1);
constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{};
constexpr auto I2 = Index<2>{};
constexpr auto I3 = Index<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 1;
constexpr unsigned CPerBlockLoop = 1;
constexpr unsigned OutTileSizeH = 2;
constexpr unsigned OutTileSizeW = 2;
constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
constexpr unsigned NBlockCopyLen0 = 1;
constexpr unsigned NBlockCopyLen1 = 1;
constexpr unsigned NBlockCopyLen2 = 1;
constexpr unsigned NBlockCopyLen3 = 64;
gridwise_convolution<T,
InDesc,
WeiDesc,
OutDesc,
NPerBlock,
KPerBlock,
CPerBlockLoop,
OutTileSizeH,
OutTileSizeW,
YPerBlock,
XPerBlock,
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3>
<<<grid_dim, block_dim>>>(InDesc{},
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
WeiDesc{},
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
OutDesc{},
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
#endif
checkCudaErrors(cudaGetLastError());
out_device_buf.FromDevice(out.mData.data());
}
#endif
int main()
{
@@ -176,14 +236,22 @@ int main()
constexpr unsigned K = 1;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 0
#elif 1
constexpr unsigned N = 1;
constexpr unsigned C = 1;
constexpr unsigned HI = 130;
constexpr unsigned WI = 130;
constexpr unsigned HI = 36;
constexpr unsigned WI = 36;
constexpr unsigned K = 1;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 0
constexpr unsigned N = 1;
constexpr unsigned C = 1;
constexpr unsigned HI = 130;
constexpr unsigned WI = 130;
constexpr unsigned K = 1;
constexpr unsigned S = 3;
constexpr unsigned R = 3;
#elif 0
constexpr unsigned N = 3;
constexpr unsigned C = 16;
@@ -214,7 +282,12 @@ int main()
wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
host_convolution(in, wei, out_host);
device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
#if 0
device_convolution(in, wei, out_device);
#else
const_device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
#endif
std::cout << __func__ << ": done" << std::endl;

View File

@@ -231,17 +231,8 @@ template <class TFloat,
class InDesc,
class WeiDesc,
class OutDesc,
unsigned S,
unsigned R,
unsigned InTileSizeH,
unsigned InTileSizeW,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned YPerBlock,
unsigned XPerBlock,
unsigned CPerBlockLoop>
unsigned OutTileSizeW>
__device__ void blockwise_convolution(InDesc,
TFloat* const __restrict__ p_in,
WeiDesc,
@@ -258,6 +249,19 @@ __device__ void blockwise_convolution(InDesc,
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr unsigned S = wei_desc.GetLength(I2);
constexpr unsigned R = wei_desc.GetLength(I3);
constexpr unsigned NPerBlock = out_desc.GetLength(I0);
constexpr unsigned KPerBlock = out_desc.GetLength(I1);
constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
constexpr unsigned CPerBlockLoop = in_desc.GetLength(I1);
constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
#if 1
if(threadIdx.x == 0)
{
@@ -383,15 +387,17 @@ template <class TFloat,
class InDesc,
class WeiDesc,
class OutDesc,
unsigned InTileSizeH,
unsigned InTileSizeW,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlockLoop,
unsigned OutTileSizeH,
unsigned OutTileSizeW,
unsigned YPerBlock,
unsigned XPerBlock,
unsigned CPerBlockLoop>
unsigned NBlockCopyLen0,
unsigned NBlockCopyLen1,
unsigned NBlockCopyLen2,
unsigned NBlockCopyLen3>
__global__ void gridwise_convolution(InDesc,
TFloat* const __restrict__ p_in,
WeiDesc,
@@ -420,11 +426,10 @@ __global__ void gridwise_convolution(InDesc,
}
#endif
constexpr unsigned NBlockWork = (in_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned YBlockWork = (in_desc.GetLength(I2) + YPerBlock - 1) / YPerBlock;
constexpr unsigned XBlockWork = (in_desc.GetLength(I3) + XPerBlock - 1) / XPerBlock;
constexpr unsigned KBlockWork = (wei_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned NBlockWork = (out_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork = (out_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned YBlockWork = (out_desc.GetLength(I2) + YPerBlock - 1) / YPerBlock;
constexpr unsigned XBlockWork = (out_desc.GetLength(I3) + XPerBlock - 1) / XPerBlock;
const unsigned block_id =
blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * (gridDim.y * gridDim.x);
@@ -434,6 +439,7 @@ __global__ void gridwise_convolution(InDesc,
CPerBlockLoop,
YPerBlock * OutTileSizeH + S - 1,
XPerBlock * OutTileSizeW + R - 1>{});
constexpr auto wei_block_desc =
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlockLoop, S, R>{});
@@ -474,10 +480,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat,
decltype(in_desc),
decltype(in_block_desc),
1,
1,
1,
64,
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy)>(in_desc,
p_in + in_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
@@ -491,10 +497,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat,
decltype(wei_desc),
decltype(wei_block_desc),
1,
1,
1,
64,
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy)>(
wei_desc,
p_wei + wei_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
@@ -506,10 +512,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat,
decltype(out_desc),
decltype(out_block_desc),
1,
1,
1,
64,
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy)>(out_desc,
p_out + out_desc.Get1dIndex(n_block_work_begin,
k_block_work_begin,
@@ -526,17 +532,8 @@ __global__ void gridwise_convolution(InDesc,
decltype(in_block_desc),
decltype(wei_block_desc),
decltype(out_block_desc),
S,
R,
InTileSizeH,
InTileSizeW,
OutTileSizeH,
OutTileSizeW,
NPerBlock,
KPerBlock,
YPerBlock,
XPerBlock,
CPerBlockLoop>(
OutTileSizeW>(
in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block);
__syncthreads();
@@ -545,10 +542,10 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op<TFloat,
decltype(out_block_desc),
decltype(out_desc),
1,
1,
1,
64,
NBlockCopyLen0,
NBlockCopyLen1,
NBlockCopyLen2,
NBlockCopyLen3,
decltype(f_copy)>(out_block_desc,
p_out_block,
out_desc,