hip build

This commit is contained in:
Chao Liu
2019-02-15 00:54:30 -06:00
parent 121693b3d3
commit 67c6f73ffe
35 changed files with 454 additions and 394 deletions

View File

@@ -245,10 +245,11 @@ struct BlockwiseChwnTensorCopyPadded
constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
const Float* p_src_tmp =
p_src + src_desc.Get1dIndex(c_block_data_begin,
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
n_block_data_begin);
p_src +
src_desc.Get1dIndex(c_block_data_begin,
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
n_block_data_begin);
#if 0
if(get_thread_local_1d_id() == 0)

View File

@@ -95,10 +95,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
Float p_out_thread[out_thread_desc.GetElementSpace()];
threadwise_4d_tensor_copy(out_block_desc,
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
p_out_block +
out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
out_thread_desc,
p_out_thread,
out_thread_desc.GetLengths());
@@ -109,10 +110,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
// threadwise convolution
threadwise_direct_convolution_2(
in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data_begin,
hi_thread_data_begin,
wi_thread_data_begin),
p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data_begin,
hi_thread_data_begin,
wi_thread_data_begin),
wei_thread_block_desc,
p_wei_block +
wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
@@ -124,10 +126,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
threadwise_4d_tensor_copy(out_thread_desc,
p_out_thread,
out_block_desc,
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
p_out_block +
out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
out_thread_desc.GetLengths());
}
}

View File

@@ -305,8 +305,9 @@ struct BlockwiseGemmBlockABlockBThreadC
constexpr unsigned NClusterWork =
(NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);
static_assert(BlockSize == (MClusterWork * MThreadPerCluster) *
(NClusterWork * NThreadPerCluster),
static_assert(BlockSize ==
(MClusterWork * MThreadPerCluster) *
(NClusterWork * NThreadPerCluster),
"wrong! wrong BlockSize");
if(DistributeThreadAlongColumnFirst)
@@ -685,7 +686,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;
// preload A, B
// preload A, B
#pragma unroll
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{ // copy A-sub to form A
@@ -718,7 +719,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
FloatA* p_a_thread_next = even_loop ? p_a_thread_1 : p_a_thread_0;
FloatB* p_b_thread_next = even_loop ? p_b_thread_1 : p_b_thread_0;
// preload next A, B
// preload next A, B
#pragma unroll
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{ // copy A-sub to form A
@@ -906,8 +907,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
c_thread_sub_mtx,
False,
p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
n_repeat * NPerThreadSubC),
p_c_thread +
c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
n_repeat * NPerThreadSubC),
f_accum);
}
}

View File

@@ -62,4 +62,4 @@ struct Sequence
printf("Sequence::ReorderByPutOldToNew not implemented");
assert(false);
}
};
};

11
src/include/config.h.in Normal file
View File

@@ -0,0 +1,11 @@
#pragma once
#cmakedefine01 DEVICE_BACKEND_HIP
#cmakedefine01 DEVICE_BACKEND_CUDA
#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#endif

View File

@@ -1,5 +1,6 @@
#pragma once
#include <memory>
#include "config.h"
struct DeviceMem
{
@@ -27,4 +28,31 @@ struct KernelTimer
std::unique_ptr<KernelTimerImpl> impl;
};
void launch_kernel(const void* func, dim3 grid_dim, dim3 block_dim, void** args, float& time);
template <typename... Args, typename F>
float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args)
{
KernelTimer timer;
#if DEVICE_BACKEND_HIP
timer.Start();
hipLaunchKernelGGL(kernel, grid_dim, block_dim, 0, 0, args...);
timer.End();
hipGetErrorString(hipGetLastError());
#elif DEVICE_BACKEND_CUDA
const void* f = reinterpret_cast<const void*>(kernel);
void* p_args = {&args...};
timer.Start();
cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, 0, 0);
timer.End();
checkCudaErrors(error);
#endif
return timer.GetElapsedTime();
}

View File

@@ -1,4 +1,5 @@
#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh"
@@ -146,10 +147,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
c_block_work_begin += CPerBlock)
{
// copy input tensor to LDS
blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
hi_block_work_begin,
wi_block_work_begin),
blockwise_in_copy.Run(p_in_global +
in_global_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
hi_block_work_begin,
wi_block_work_begin),
p_in_block);
// copy weight tensor to LDS
@@ -176,9 +178,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
}
// copy output tensor from LDS to device mem
blockwise_out_copy.Run(p_out_block,
p_out_global + out_global_desc.Get1dIndex(n_block_work_begin,
k_block_work_begin,
ho_block_work_begin,
wo_block_work_begin));
blockwise_out_copy.Run(
p_out_block,
p_out_global +
out_global_desc.Get1dIndex(
n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin));
}

View File

@@ -1,4 +1,5 @@
#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh"
@@ -162,10 +163,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
c_block_data_begin += CPerBlock, __syncthreads())
{
// copy input tensor to LDS
blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
blockwise_in_copy.Run(p_in_global +
in_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
p_in_block);
// copy weight tensor to LDS
@@ -177,14 +179,15 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
{
// threadwise convolution
// threadwise convolution
#if 1
threadwise_direct_convolution_2(
in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_thread_block_desc,
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_thread_desc,
@@ -192,10 +195,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
#elif 0
threadwise_direct_convolution_3(
in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_thread_block_desc,
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_thread_desc,
@@ -209,9 +213,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
out_thread_desc,
p_out_thread,
out_global_desc,
p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
p_out_global +
out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_thread_desc.GetLengths());
}

View File

@@ -121,8 +121,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
decltype(in_chwn_block_desc),
decltype(in_chwn_block_desc.GetLengths())>{};
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
@@ -199,8 +199,9 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
const Float* p_in_global_block_begin =
p_in_global + in_chwn_global_desc.Get1dIndex(
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
p_in_global +
in_chwn_global_desc.Get1dIndex(
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
const Float* p_wei_global_block_begin =
p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
@@ -257,10 +258,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
out_hkwn_thread_desc,
p_out_thread,
out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
p_out_global +
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_khwn_from_hkwn);
}

View File

@@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
out_hkwn_thread_desc,
p_out_thread,
out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
p_out_global +
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_khwn_from_hkwn);
}

View File

@@ -256,7 +256,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0;
Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0;
// preload next data
// preload next data
#if 1
// input: global mem to LDS,
blockwise_in_copy.Run(p_in_global,
@@ -339,10 +339,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
out_hkwn_thread_desc,
p_out_thread,
out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
p_out_global +
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_khwn_from_hkwn);
}

View File

@@ -160,10 +160,11 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
in_nchw_global_desc,
p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
p_in_global +
in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
in_chwn_block_desc,
p_in_block,
in_nchw_block_desc.GetLengths(),
@@ -244,10 +245,11 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_hkwn_thread_desc,
p_out_thread,
out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_nkhw_from_hkwn);
#else
@@ -261,10 +263,11 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_nkhw_thread_desc,
p_out_thread,
out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_nkhw_thread_desc.GetLengths());
#endif
}

View File

@@ -166,10 +166,11 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
in_nchw_global_desc,
p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
p_in_global +
in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
in_chwn_block_desc,
p_in_block,
in_nchw_block_desc.GetLengths(),
@@ -179,9 +180,10 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
#if 1
// weight: global mem to LDS,
// format is [S,R,C,K], no conversion needed
blockwise_wei_copy.Run(p_wei_global + wei_srck_global_desc.Get1dIndex(
0, 0, c_block_data_begin, k_block_data_begin),
p_wei_block);
blockwise_wei_copy.Run(
p_wei_global +
wei_srck_global_desc.Get1dIndex(0, 0, c_block_data_begin, k_block_data_begin),
p_wei_block);
#endif
__syncthreads();
@@ -217,10 +219,11 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
out_hkwn_thread_desc,
p_out_thread,
out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_nkhw_from_hkwn);
}

View File

@@ -111,8 +111,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
}
#endif
// blockwise in copy
// formmat is [CPerBlock,BPerBlock + BGhostRead]
// blockwise in copy
// formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
@@ -121,7 +121,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
@@ -129,7 +129,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>{};
#elif 1
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
@@ -137,8 +137,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
InBlockCopyDataPerRead>{};
#endif
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,

View File

@@ -111,8 +111,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
}
#endif
// blockwise in copy
// formmat is [CPerBlock,BPerBlock + BGhostRead]
// blockwise in copy
// formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
@@ -121,7 +121,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
@@ -129,7 +129,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>{};
#elif 1
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
@@ -137,8 +137,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
InBlockCopyDataPerRead>{};
#endif
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
@@ -258,7 +258,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
__syncthreads();
// load next data
// load next data
#if 0
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next);

View File

@@ -103,8 +103,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(const Float* const __restric
}
#endif
// blockwise in copy
// formmat is [CPerBlock,BPerBlock + BGhostRead]
// blockwise in copy
// formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,

View File

@@ -103,8 +103,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
}
#endif
// in: global mem to LDS
// formmat is [CPerBlock,BPerBlock + BGhostRead]
// in: global mem to LDS
// formmat is [CPerBlock,BPerBlock + BGhostRead]
#if 1
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
@@ -129,8 +129,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
decltype(in_cb_block_desc.GetLengths())>{};
#endif
// weight: global mem to LDS,
// format is [S,R,CPerBlock,KPerBlock]
// weight: global mem to LDS,
// format is [S,R,CPerBlock,KPerBlock]
#if 1
const auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize,
@@ -191,7 +191,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
// set threadwise output tensor to 0
threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
// prelog: load data
// prelog: load data
#if 1
// input: global mem to LDS,
blockwise_in_copy.Run(p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin),
@@ -220,9 +220,10 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
#if 1
// preload next data
// input: global mem to LDS,
blockwise_in_copy.Run(p_in_global + in_cb_global_desc.Get1dIndex(
c_block_data_begin + CPerBlock, b_block_data_begin),
p_in_block_next);
blockwise_in_copy.Run(
p_in_global +
in_cb_global_desc.Get1dIndex(c_block_data_begin + CPerBlock, b_block_data_begin),
p_in_block_next);
#endif
#if 1

View File

@@ -189,17 +189,18 @@ __global__ void gridwise_winograd_convolution(const Float* const __restrict__ p_
S,
R,
OutTileSizeH,
OutTileSizeW>(in_transform_thread_block_desc,
p_in_transform_block + in_transform_block_desc.Get1dIndex(
n_thread_data_begin,
c_thread_data,
y_thread_data_begin * InTileSizeH,
x_thread_data_begin * InTileSizeW),
wei_transform_thread_block_desc,
p_wei_transform_block + wei_transform_block_desc.Get1dIndex(
k_thread_data_begin, c_thread_data, 0, 0),
out_transform_thread_desc,
p_out_transform_thread);
OutTileSizeW>(
in_transform_thread_block_desc,
p_in_transform_block +
in_transform_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
y_thread_data_begin * InTileSizeH,
x_thread_data_begin * InTileSizeW),
wei_transform_thread_block_desc,
p_wei_transform_block +
wei_transform_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_transform_thread_desc,
p_out_transform_thread);
}
};

View File

@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
return os;
}
typedef enum
{
typedef enum {
Half = 0,
Float = 1,
} DataType_t;