mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
v5r1 fusion kernels for inference (#49)
* init
* refactor for 1x1
* rename e0_e1
* add e1 with bugs
* debug
* fixed
* fixed e1
* add timer
* improve threadwise gemm with dot2
* add e2
* tuning
* separate c2
* add nhwc
* restore nchwc
* clean
* opt
* fixed; tuning
* add BGlobalMoveSliceWindowStepHacks{}
* tuning
* repeat running
* adjust
* merge v5r1 nchwc
* add adaptors
* split k0 k1 in c_thread_grid
* split h and w
* remove v5r1 nhwc
* clean for pr
* remove host_conv_add
* clean code
* clean
* add dynamic support
* static mode
* test static
* add conv+add fusion
* fixed validation
* naming fix
* use activ_enum
* make static
* refactor conv_add for InMem::add
* add bias
* add conv_out
* add configurable makeddesc
* add maxpool fusion
* add maxpool host for validation
* enable static desc
* conv-only use v5r1_add
* test
* test
* for binary dumps
* fixed incorrect results due to typo
* clean
* debugging maxpool
* workaround with offset trick
* clean code
* modularize ops of fusion
* add gridwise_gemm_v3
* create separate fusion fun
* enable dynamic mode of conv and conv+resize_add
* add dynamic mode of maxpool
* add pass by pointer
* add activ_type as arguments
* merge develop
* clean
* reset config to old default
Co-authored-by: Chao Liu <chao.liu2@amd.com>
This commit is contained in:
@@ -10,99 +10,99 @@ template <index_t BlockSize,
|
||||
typename FloatA,
|
||||
typename FloatB,
|
||||
typename FloatC,
|
||||
typename BlockMatrixA,
|
||||
typename BlockMatrixB,
|
||||
typename ThreadMatrixC,
|
||||
index_t KPerThread,
|
||||
index_t HPerThread,
|
||||
index_t WPerThread,
|
||||
typename ABlockDesc_E1_K1_E2,
|
||||
typename BBlockDesc_E1_N_Ho_Wo_E2,
|
||||
typename CThreadDesc_K_N_Ho_Wo,
|
||||
index_t EPerThreadLoop,
|
||||
index_t ThreadGemmADataPerRead_K,
|
||||
index_t ThreadGemmBDataPerRead_W>
|
||||
index_t KPerThreadLoop>
|
||||
struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
|
||||
{
|
||||
struct MatrixIndex
|
||||
{
|
||||
index_t k;
|
||||
index_t h;
|
||||
index_t w;
|
||||
};
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
static constexpr auto I2 = Number<2>{};
|
||||
static constexpr auto I3 = Number<3>{};
|
||||
static constexpr auto I4 = Number<4>{};
|
||||
|
||||
// HACK: fix this @Jing Zhang
|
||||
static constexpr index_t KPerThreadSubC = 4;
|
||||
using AIndex = MultiIndex<3>;
|
||||
using BIndex = MultiIndex<3>;
|
||||
using CIndex = MultiIndex<4>;
|
||||
|
||||
static constexpr auto E1 = ABlockDesc_E1_K1_E2{}.GetLength(I0);
|
||||
static constexpr auto KPerBlock = ABlockDesc_E1_K1_E2{}.GetLength(I1);
|
||||
static constexpr auto E2 = ABlockDesc_E1_K1_E2{}.GetLength(I2);
|
||||
|
||||
static constexpr auto HoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
|
||||
static constexpr auto WoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
|
||||
|
||||
static constexpr auto KPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I0);
|
||||
static constexpr auto HoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I2);
|
||||
static constexpr auto WoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I3);
|
||||
|
||||
static constexpr auto a_thread_mtx_ = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadSubC>{}));
|
||||
make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadLoop>{}, Number<E2>{}));
|
||||
|
||||
static constexpr auto b_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple(
|
||||
Number<EPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
|
||||
static constexpr auto b_thread_mtx_ =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<EPerThreadLoop>{},
|
||||
Number<1>{},
|
||||
Number<HoPerThread>{},
|
||||
Number<WoPerThread>{},
|
||||
Number<E2>{}));
|
||||
|
||||
static constexpr auto c_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple(
|
||||
Number<KPerThreadSubC>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
|
||||
|
||||
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
|
||||
FloatA,
|
||||
BlockMatrixA,
|
||||
decltype(a_thread_mtx_),
|
||||
Sequence<EPerThreadLoop, KPerThreadSubC>,
|
||||
Sequence<0, 1>,
|
||||
1,
|
||||
ThreadGemmADataPerRead_K,
|
||||
1>;
|
||||
Number<KPerThreadLoop>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
|
||||
|
||||
__device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3()
|
||||
: c_thread_begin_mtx_idx_{GetBeginOfThreadMatrixC(get_thread_local_1d_id())},
|
||||
a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread)}
|
||||
: c_thread_origin_data_idx_{GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id())},
|
||||
a_thread_copy_{make_tuple(0, c_thread_origin_data_idx_[I0] * KPerThread, 0)}
|
||||
{
|
||||
static_assert(BlockMatrixA::IsKnownAtCompileTime() &&
|
||||
BlockMatrixB::IsKnownAtCompileTime() &&
|
||||
ThreadMatrixC::IsKnownAtCompileTime(),
|
||||
static_assert(ABlockDesc_E1_K1_E2::IsKnownAtCompileTime() &&
|
||||
BBlockDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() &&
|
||||
CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
|
||||
"wrong! Desc should be known at compile-time");
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
static_assert(
|
||||
ABlockDesc_E1_K1_E2{}.GetLength(I0) == BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I0) &&
|
||||
ABlockDesc_E1_K1_E2{}.GetLength(I2) == BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I4),
|
||||
"wrong! E dimension not consistent\n");
|
||||
|
||||
static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
|
||||
"wrong! K dimension not consistent\n");
|
||||
static_assert(E1 % EPerThreadLoop == 0, "");
|
||||
static_assert(KPerThread % KPerThreadLoop == 0, "");
|
||||
|
||||
constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed
|
||||
constexpr index_t H = BlockMatrixB{}.GetLength(I2);
|
||||
constexpr index_t W = BlockMatrixB{}.GetLength(I3);
|
||||
|
||||
static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0,
|
||||
static_assert(KPerBlock % KPerThread == 0 && HoPerBlock % HoPerThread == 0 &&
|
||||
WoPerBlock % WoPerThread == 0,
|
||||
"wrong! Cannot evenly divide work among\n");
|
||||
|
||||
constexpr auto KThreadCluster = K / KPerThread;
|
||||
constexpr auto HThreadCluster = H / HPerThread;
|
||||
constexpr auto WThreadCluster = W / WPerThread;
|
||||
constexpr auto KThreadCluster = KPerBlock / KPerThread;
|
||||
constexpr auto HThreadCluster = HoPerBlock / HoPerThread;
|
||||
constexpr auto WThreadCluster = WoPerBlock / WoPerThread;
|
||||
|
||||
static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster,
|
||||
"wrong! wrong blocksize\n");
|
||||
}
|
||||
|
||||
__device__ static constexpr auto GetThreadMatrixCLengths()
|
||||
__device__ static constexpr auto GetCThreadDesc_K_N_Ho_WoLengths()
|
||||
{
|
||||
return Sequence<KPerThread, 1, HPerThread, WPerThread>{};
|
||||
return Sequence<KPerThread, I1, HoPerThread, WoPerThread>{};
|
||||
}
|
||||
|
||||
__device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
|
||||
__device__ static CIndex GetBeginOfCThreadDesc_K_N_Ho_Wo(index_t thread_id)
|
||||
{
|
||||
constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
|
||||
constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});
|
||||
constexpr auto K0 = KPerBlock / KPerThread;
|
||||
constexpr auto N0 = I1;
|
||||
constexpr auto H0 = HoPerBlock / HoPerThread;
|
||||
constexpr auto W0 = WoPerBlock / WoPerThread;
|
||||
|
||||
constexpr auto num_w_threads = W / WPerThread;
|
||||
constexpr auto num_h_threads = H / HPerThread;
|
||||
constexpr auto num_hw_threads = num_w_threads * num_h_threads;
|
||||
constexpr auto c_threadid_to_k_n_h_w_thread_cluster_adaptor =
|
||||
make_single_stage_tensor_adaptor(
|
||||
make_tuple(make_merge_transform(make_tuple(K0, N0, H0, W0))),
|
||||
make_tuple(Sequence<0, 1, 2, 3>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
index_t k_thread_id = thread_id / num_hw_threads;
|
||||
index_t hw_thread_id = thread_id % num_hw_threads;
|
||||
const auto c_k_n_h_w_thread_cluster_idx =
|
||||
c_threadid_to_k_n_h_w_thread_cluster_adaptor.CalculateBottomIndex(
|
||||
make_multi_index(thread_id));
|
||||
|
||||
index_t h_thread_id = hw_thread_id / num_w_threads;
|
||||
index_t w_thread_id = hw_thread_id % num_w_threads;
|
||||
|
||||
return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
|
||||
return c_k_n_h_w_thread_cluster_idx;
|
||||
}
|
||||
|
||||
template <typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer>
|
||||
@@ -116,19 +116,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
|
||||
is_same<remove_cvref_t<typename CThreadBuffer::type>, remove_cvref_t<FloatC>>::value &&
|
||||
"wrong! inconsistent type");
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
|
||||
constexpr auto a_block_mtx = BlockMatrixA{};
|
||||
|
||||
constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
|
||||
|
||||
// HACK: fix this @Jing Zhang
|
||||
constexpr auto HoPerThreadSubC = 2;
|
||||
constexpr auto WoPerThreadSubC = 2;
|
||||
|
||||
static_assert(KPerThread % KPerThreadSubC == 0, "");
|
||||
static_assert(HPerThread % HoPerThreadSubC == 0, "");
|
||||
static_assert(WPerThread % WoPerThreadSubC == 0, "");
|
||||
constexpr auto a_block_mtx = ABlockDesc_E1_K1_E2{};
|
||||
|
||||
// thread A buffer for GEMM
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, FloatA, a_thread_mtx_.GetElementSpaceSize(), true>
|
||||
@@ -139,42 +127,46 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
|
||||
FloatC,
|
||||
decltype(a_thread_mtx_),
|
||||
decltype(b_thread_mtx_),
|
||||
decltype(c_thread_mtx_),
|
||||
HoPerThreadSubC,
|
||||
WoPerThreadSubC>{};
|
||||
decltype(c_thread_mtx_)>{};
|
||||
|
||||
static_for<0, EPerBlock, EPerThreadLoop>{}([&](auto e_begin) {
|
||||
static_for<0, KPerThread, KPerThreadSubC>{}([&](auto k_begin) {
|
||||
static_for<0, E1, EPerThreadLoop>{}([&](auto e_begin) {
|
||||
static_for<0, KPerThread, KPerThreadLoop>{}([&](auto k_begin) {
|
||||
a_thread_copy_.Run(a_block_mtx,
|
||||
make_tuple(e_begin, k_begin),
|
||||
make_tuple(e_begin, k_begin, I0),
|
||||
a_block_buf,
|
||||
a_thread_mtx_,
|
||||
make_tuple(I0, I0),
|
||||
make_tuple(I0, I0, I0),
|
||||
a_thread_buf);
|
||||
|
||||
static_for<0, HPerThread, HoPerThreadSubC>{}([&](auto h_begin) {
|
||||
static_for<0, WPerThread, WoPerThreadSubC>{}([&](auto w_begin) {
|
||||
threadwise_gemm.Run(a_thread_buf,
|
||||
make_tuple(I0, I0),
|
||||
b_thread_buf,
|
||||
make_tuple(e_begin, I0, h_begin, w_begin),
|
||||
c_thread_buf,
|
||||
make_tuple(k_begin, I0, h_begin, w_begin));
|
||||
});
|
||||
});
|
||||
threadwise_gemm.Run(a_thread_buf,
|
||||
make_tuple(I0, I0, I0),
|
||||
b_thread_buf,
|
||||
make_tuple(e_begin, I0, I0, I0, I0),
|
||||
c_thread_buf,
|
||||
make_tuple(k_begin, I0, I0, I0));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <typename ABlockSliceMoveStepIdx>
|
||||
__device__ void MoveASliceWindow(const BlockMatrixA&,
|
||||
const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx)
|
||||
__device__ void MoveABlockSliceWindow(const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx)
|
||||
{
|
||||
a_thread_copy_.MoveSrcSliceWindow(BlockMatrixA{}, a_block_slice_move_step_idx);
|
||||
a_thread_copy_.MoveSrcSliceWindow(ABlockDesc_E1_K1_E2{}, a_block_slice_move_step_idx);
|
||||
}
|
||||
|
||||
private:
|
||||
MatrixIndex c_thread_begin_mtx_idx_;
|
||||
using AThreadCopy =
|
||||
ThreadwiseTensorSliceTransfer_v4<FloatA,
|
||||
FloatA,
|
||||
ABlockDesc_E1_K1_E2,
|
||||
decltype(a_thread_mtx_),
|
||||
Sequence<EPerThreadLoop, KPerThreadLoop, E2>,
|
||||
Sequence<0, 1, 2>,
|
||||
2,
|
||||
E2,
|
||||
E2>;
|
||||
|
||||
CIndex c_thread_origin_data_idx_;
|
||||
|
||||
AThreadCopy a_thread_copy_;
|
||||
};
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -9,21 +9,22 @@ namespace ck {
|
||||
// C[M, N] += transpose(A[K, M]) * B[K, N]
|
||||
// Element of matrix can be vectorized data
|
||||
// Assume:
|
||||
// 1. ADesc, BDesc, CDesc are known at compile-time
|
||||
// 1. AThreadDesc_E1_K_E2, BThreadDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo are known at
|
||||
// compile-time
|
||||
// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
|
||||
template <typename FloatA,
|
||||
typename FloatB,
|
||||
typename FloatC,
|
||||
typename ADesc,
|
||||
typename BDesc,
|
||||
typename CDesc,
|
||||
index_t H,
|
||||
index_t W,
|
||||
typename enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
|
||||
CDesc::IsKnownAtCompileTime(),
|
||||
typename AThreadDesc_E1_K_E2,
|
||||
typename BThreadDesc_E1_N_Ho_Wo_E2,
|
||||
typename CThreadDesc_K_N_Ho_Wo,
|
||||
typename enable_if<AThreadDesc_E1_K_E2::IsKnownAtCompileTime() &&
|
||||
BThreadDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() &&
|
||||
CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
|
||||
bool>::type = false>
|
||||
struct ThreadwiseGemmDlops_km_kn_mn_v3
|
||||
{
|
||||
|
||||
template <typename ABuffer,
|
||||
typename AOriginIdx,
|
||||
typename BBuffer,
|
||||
@@ -37,8 +38,10 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
|
||||
CBuffer& c_buf,
|
||||
COriginIdx)
|
||||
{
|
||||
static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
|
||||
CDesc::IsKnownAtCompileTime(),
|
||||
|
||||
static_assert(AThreadDesc_E1_K_E2::IsKnownAtCompileTime() &&
|
||||
BThreadDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() &&
|
||||
CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
|
||||
"wrong! Desc should be known at compile-time");
|
||||
|
||||
static_assert(is_known_at_compile_time<remove_cvref_t<AOriginIdx>>::value &&
|
||||
@@ -54,102 +57,107 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto E = ADesc{}.GetLength(I0);
|
||||
constexpr auto K = ADesc{}.GetLength(I1);
|
||||
constexpr auto E1 = AThreadDesc_E1_K_E2{}.GetLength(I0);
|
||||
constexpr auto K = AThreadDesc_E1_K_E2{}.GetLength(I1);
|
||||
constexpr auto E2 = AThreadDesc_E1_K_E2{}.GetLength(I2);
|
||||
|
||||
constexpr auto Ho = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
|
||||
constexpr auto Wo = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
|
||||
|
||||
constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
|
||||
constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
|
||||
constexpr auto c_origin_idx = to_multi_index(COriginIdx{});
|
||||
|
||||
static_for<0, E, 1>{}([&](auto e) {
|
||||
if constexpr((Ho % 2 == 0) && (Wo % 2 == 0))
|
||||
{
|
||||
constexpr auto SubHW = 2;
|
||||
|
||||
static_for<0, K, 1>{}([&](auto k) {
|
||||
constexpr index_t a_offset =
|
||||
ADesc{}.CalculateOffset(a_origin_idx + make_tuple(e, k));
|
||||
static_for<0, Ho, SubHW>{}([&](auto h) {
|
||||
static_for<0, Wo, SubHW>{}([&](auto w) {
|
||||
static_for<0, E1, 1>{}([&](auto e1) {
|
||||
static_for<0, E2, 1>{}([&](auto e2) {
|
||||
constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset(
|
||||
a_origin_idx + make_tuple(e1, k, e2));
|
||||
|
||||
if constexpr(H == 2 && W == 2)
|
||||
{
|
||||
constexpr index_t b_offset_0 =
|
||||
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
|
||||
constexpr index_t b_offset_1 =
|
||||
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1));
|
||||
constexpr index_t b_offset_2 =
|
||||
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
|
||||
constexpr index_t b_offset_3 =
|
||||
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1));
|
||||
constexpr index_t b0_offset =
|
||||
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
|
||||
b_origin_idx + make_tuple(e1, 0, h, w, e2));
|
||||
|
||||
constexpr index_t c_offset_0 =
|
||||
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
|
||||
constexpr index_t c_offset_1 =
|
||||
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1));
|
||||
constexpr index_t c_offset_2 =
|
||||
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
|
||||
constexpr index_t c_offset_3 =
|
||||
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1));
|
||||
constexpr index_t b1_offset =
|
||||
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
|
||||
b_origin_idx + make_tuple(e1, 0, h, w + 1, e2));
|
||||
|
||||
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
|
||||
b_buf[Number<b_offset_0>{}],
|
||||
b_buf[Number<b_offset_1>{}],
|
||||
b_buf[Number<b_offset_2>{}],
|
||||
b_buf[Number<b_offset_3>{}],
|
||||
c_buf(Number<c_offset_0>{}),
|
||||
c_buf(Number<c_offset_1>{}),
|
||||
c_buf(Number<c_offset_2>{}),
|
||||
c_buf(Number<c_offset_3>{}));
|
||||
}
|
||||
else if constexpr(H == 4 && W == 1)
|
||||
{
|
||||
constexpr index_t b_offset_0 =
|
||||
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
|
||||
constexpr index_t b_offset_1 =
|
||||
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
|
||||
constexpr index_t b_offset_2 =
|
||||
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0));
|
||||
constexpr index_t b_offset_3 =
|
||||
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0));
|
||||
constexpr index_t b2_offset =
|
||||
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
|
||||
b_origin_idx + make_tuple(e1, 0, h + 1, w, e2));
|
||||
|
||||
constexpr index_t c_offset_0 =
|
||||
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
|
||||
constexpr index_t c_offset_1 =
|
||||
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
|
||||
constexpr index_t c_offset_2 =
|
||||
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0));
|
||||
constexpr index_t c_offset_3 =
|
||||
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0));
|
||||
constexpr index_t b3_offset =
|
||||
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
|
||||
b_origin_idx + make_tuple(e1, 0, h + 1, w + 1, e2));
|
||||
|
||||
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
|
||||
b_buf[Number<b_offset_0>{}],
|
||||
b_buf[Number<b_offset_1>{}],
|
||||
b_buf[Number<b_offset_2>{}],
|
||||
b_buf[Number<b_offset_3>{}],
|
||||
c_buf(Number<c_offset_0>{}),
|
||||
c_buf(Number<c_offset_1>{}),
|
||||
c_buf(Number<c_offset_2>{}),
|
||||
c_buf(Number<c_offset_3>{}));
|
||||
}
|
||||
else
|
||||
{
|
||||
static_for<0, H, 1>{}([&](auto h) {
|
||||
static_for<0, W, 1>{}([&](auto w) {
|
||||
constexpr index_t b_offset =
|
||||
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, h, w));
|
||||
constexpr index_t c0_offset =
|
||||
CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx +
|
||||
make_tuple(k, 0, h, w));
|
||||
|
||||
constexpr index_t c_offset =
|
||||
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w));
|
||||
constexpr index_t c1_offset =
|
||||
CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
|
||||
c_origin_idx + make_tuple(k, 0, h, w + 1));
|
||||
|
||||
#if 0
|
||||
c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(
|
||||
a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
|
||||
#else
|
||||
amd_assembly_inner_product(a_buf[Number<a_offset>{}],
|
||||
b_buf[Number<b_offset>{}],
|
||||
c_buf(Number<c_offset>{}));
|
||||
#endif
|
||||
constexpr index_t c2_offset =
|
||||
CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
|
||||
c_origin_idx + make_tuple(k, 0, h + 1, w));
|
||||
|
||||
constexpr index_t c3_offset =
|
||||
CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
|
||||
c_origin_idx + make_tuple(k, 0, h + 1, w + 1));
|
||||
|
||||
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
|
||||
b_buf[Number<b0_offset>{}],
|
||||
b_buf[Number<b1_offset>{}],
|
||||
b_buf[Number<b2_offset>{}],
|
||||
b_buf[Number<b3_offset>{}],
|
||||
c_buf(Number<c0_offset>{}),
|
||||
c_buf(Number<c1_offset>{}),
|
||||
c_buf(Number<c2_offset>{}),
|
||||
c_buf(Number<c3_offset>{}));
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
static_for<0, K, 1>{}([&](auto k) {
|
||||
static_for<0, Ho, 1>{}([&](auto h) {
|
||||
static_for<0, Wo, 1>{}([&](auto w) {
|
||||
static_for<0, E1, 1>{}([&](auto e1) {
|
||||
static_for<0, E2, 1>{}([&](auto e2) {
|
||||
constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset(
|
||||
a_origin_idx + make_tuple(e1, k, e2));
|
||||
|
||||
constexpr index_t b_offset =
|
||||
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
|
||||
b_origin_idx + make_tuple(e1, 0, h, w, e2));
|
||||
|
||||
constexpr index_t c_offset =
|
||||
CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx +
|
||||
make_tuple(k, 0, h, w));
|
||||
|
||||
inner_product<FloatA, FloatB, FloatC>(a_buf[Number<a_offset>{}],
|
||||
b_buf[Number<b_offset>{}],
|
||||
c_buf(Number<c_offset>{}));
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -217,6 +217,22 @@ struct ThreadwiseTensorSliceTransfer_v1r3
|
||||
is_dst_valid,
|
||||
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
|
||||
}
|
||||
else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add)
|
||||
{
|
||||
|
||||
typename vector_type_maker<DstData, DstScalarPerVector>::type tmp;
|
||||
tmp.template AsType<dst_vector_t>()(Number<0>{}) =
|
||||
dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid);
|
||||
|
||||
static_for<0, DstScalarPerVector, 1>{}([&](auto t) {
|
||||
dst_vector.template AsType<DstData>()(t) += tmp.template AsType<DstData>()[t];
|
||||
});
|
||||
|
||||
dst_buf.template Set<dst_vector_t>(
|
||||
dst_coord_.GetOffset(),
|
||||
is_dst_valid,
|
||||
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
|
||||
}
|
||||
|
||||
constexpr auto move_on_dim = [&]() constexpr
|
||||
{
|
||||
@@ -666,6 +682,25 @@ struct ThreadwiseTensorSliceTransfer_v2
|
||||
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
|
||||
}
|
||||
|
||||
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
|
||||
template <typename SrcMoveSliceWindowStepHack>
|
||||
__device__ void
|
||||
MoveSrcSliceWindow(const SrcDesc& src_desc,
|
||||
const Index& src_slice_origin_step_idx,
|
||||
const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
|
||||
{
|
||||
// if src coord was not reset by RunRead(), then need to adjust the step here
|
||||
const auto adjusted_step_idx =
|
||||
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
|
||||
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
|
||||
|
||||
// is it OK to construct a new step every time?
|
||||
const auto adjusted_step = make_tensor_coordinate_step(
|
||||
src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
|
||||
|
||||
move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
|
||||
}
|
||||
|
||||
private:
|
||||
SrcCoord src_coord_;
|
||||
}; // namespace ck
|
||||
|
||||
@@ -591,6 +591,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
|
||||
}
|
||||
else if constexpr(N == 8)
|
||||
{
|
||||
#if 0
|
||||
vector_type<half_t, 8> tmp{src_thread_data};
|
||||
|
||||
llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
|
||||
@@ -604,6 +605,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
|
||||
dst_thread_addr_offset,
|
||||
dst_wave_addr_offset + 4 * sizeof(half_t),
|
||||
0);
|
||||
#else
|
||||
llvm_amdgcn_raw_buffer_store_fp32x4(as_type<float4_t>(src_thread_data),
|
||||
dst_wave_buffer_resource,
|
||||
dst_thread_addr_offset,
|
||||
dst_wave_addr_offset,
|
||||
0);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same<T, ushort>::value)
|
||||
|
||||
@@ -96,6 +96,7 @@
|
||||
// pass tensor descriptor by value or void*
|
||||
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
|
||||
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
|
||||
#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0
|
||||
|
||||
// merge transformation use magic number division
|
||||
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1
|
||||
@@ -128,7 +129,15 @@ namespace ck {
|
||||
enum InMemoryDataOperationEnum_t
|
||||
{
|
||||
Set,
|
||||
AtomicAdd
|
||||
AtomicAdd,
|
||||
Add
|
||||
};
|
||||
|
||||
enum ActivTypeEnum_t
|
||||
{
|
||||
None = 0,
|
||||
LeakyRelu,
|
||||
Sigmoid
|
||||
};
|
||||
|
||||
// index type
|
||||
|
||||
@@ -13,16 +13,25 @@ include_directories(BEFORE
|
||||
)
|
||||
|
||||
set(CONV_FWD_DRIVER_OFFLINE_SOURCE src/conv_fwd_driver_offline.cpp)
|
||||
set(CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_fwd_driver_offline_nchwc.cpp)
|
||||
set(CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_add_fwd_driver_offline_nchwc.cpp)
|
||||
set(CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_maxpool_fwd_driver_offline_nchwc.cpp)
|
||||
set(CONV_BWD_DRIVER_OFFLINE_SOURCE src/conv_bwd_driver_offline.cpp)
|
||||
set(CONV_WRW_DRIVER_OFFLINE_SOURCE src/conv_wrw_driver_offline.cpp)
|
||||
set(GEMM_DRIVER_OFFLINE_SOURCE src/gemm_driver_offline.cpp)
|
||||
|
||||
add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
|
||||
add_executable(conv_fwd_driver_offline_nchwc ${CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
|
||||
add_executable(conv_add_fwd_driver_offline_nchwc ${CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
|
||||
add_executable(conv_maxpool_fwd_driver_offline_nchwc ${CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
|
||||
add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
|
||||
add_executable(conv_wrw_driver_offline ${CONV_WRW_DRIVER_OFFLINE_SOURCE})
|
||||
add_executable(gemm_driver_offline ${GEMM_DRIVER_OFFLINE_SOURCE})
|
||||
|
||||
target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
|
||||
target_link_libraries(conv_fwd_driver_offline_nchwc PRIVATE host_tensor)
|
||||
target_link_libraries(conv_add_fwd_driver_offline_nchwc PRIVATE host_tensor)
|
||||
target_link_libraries(conv_maxpool_fwd_driver_offline_nchwc PRIVATE host_tensor)
|
||||
target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
|
||||
target_link_libraries(conv_wrw_driver_offline PRIVATE host_tensor)
|
||||
target_link_libraries(gemm_driver_offline PRIVATE host_tensor)
|
||||
|
||||
@@ -0,0 +1,220 @@
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
|
||||
|
||||
template <typename TInWei,
|
||||
typename TAcc,
|
||||
typename TOut,
|
||||
ck::ActivTypeEnum_t activ_type,
|
||||
typename InLengths,
|
||||
typename WeiLengths,
|
||||
typename AddLengths,
|
||||
typename OutLengths,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
|
||||
const InLengths& in_n_c0_hi_wi_c1_lengths,
|
||||
const WeiLengths& wei_k_c0_y_x_c1_lengths,
|
||||
const AddLengths& add_n_k0_hox2_wox2_k1_lengths,
|
||||
const OutLengths& out_n_k0_ho_wo_k1_lengths,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& in_right_pads,
|
||||
const Tensor<TInWei>& in_n_c0_hi_wi_c1,
|
||||
const Tensor<TInWei>& wei_k_c0_y_x_c1,
|
||||
const Tensor<TOut>& bias_k0_k1,
|
||||
const Tensor<TOut>& add_n_k0_hox2_wox2_k1,
|
||||
Tensor<TOut>& add_n_k0_hox2_wox2_k1_out,
|
||||
ck::index_t nrepeat)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
std::cout << __func__ << std::endl;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
|
||||
const auto N = out_n_k0_ho_wo_k1_lengths[I0];
|
||||
const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
|
||||
const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
|
||||
const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
|
||||
const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];
|
||||
|
||||
const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
|
||||
const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
|
||||
const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
|
||||
const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];
|
||||
|
||||
const auto K = wei_k_c0_y_x_c1_lengths[I0];
|
||||
const auto Y = wei_k_c0_y_x_c1_lengths[I2];
|
||||
const auto X = wei_k_c0_y_x_c1_lengths[I3];
|
||||
|
||||
const auto Hox2 = add_n_k0_hox2_wox2_k1_lengths[I2];
|
||||
const auto Wox2 = add_n_k0_hox2_wox2_k1_lengths[I3];
|
||||
|
||||
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
|
||||
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
|
||||
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
|
||||
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
|
||||
DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
|
||||
add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
|
||||
|
||||
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
|
||||
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
|
||||
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
|
||||
add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());
|
||||
|
||||
constexpr index_t InWeiVectorSize = 8;
|
||||
|
||||
if(C1 % InWeiVectorSize != 0)
|
||||
{
|
||||
throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
|
||||
}
|
||||
|
||||
#if 0
|
||||
constexpr index_t BlockSize = 256;
|
||||
|
||||
constexpr index_t KPerBlock = 32;
|
||||
constexpr index_t HoPerBlock = 8;
|
||||
constexpr index_t WoPerBlock = 64;
|
||||
|
||||
constexpr index_t E1 = C0 * 9;
|
||||
constexpr index_t E2 = 1;
|
||||
constexpr index_t E1PerBlock = C0;
|
||||
|
||||
constexpr index_t KPerThread = 16;
|
||||
constexpr index_t HoPerThread = 2;
|
||||
constexpr index_t WoPerThread = 2;
|
||||
constexpr index_t EPerThread = 1;
|
||||
|
||||
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>;
|
||||
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;
|
||||
|
||||
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
|
||||
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
|
||||
|
||||
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
|
||||
|
||||
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
|
||||
#elif 1
|
||||
constexpr auto BlockSize = 64;
|
||||
|
||||
constexpr auto KPerBlock = 8;
|
||||
constexpr auto HoPerBlock = 8;
|
||||
constexpr auto WoPerBlock = 32;
|
||||
|
||||
constexpr auto E1 = 2 * 9;
|
||||
constexpr auto E2 = 1;
|
||||
constexpr auto K2 = 2;
|
||||
constexpr auto E1PerBlock = 2;
|
||||
|
||||
constexpr auto KPerThread = KPerBlock;
|
||||
constexpr auto HoPerThread = 2;
|
||||
constexpr auto WoPerThread = 2;
|
||||
constexpr auto EPerThread = 1;
|
||||
|
||||
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
|
||||
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
|
||||
Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
|
||||
|
||||
constexpr auto ABlockTransferSrcScalarPerVector_E2 = E2;
|
||||
constexpr auto ABlockTransferDstScalarPerVector_E2 = E2;
|
||||
constexpr auto BThreadTransferSrcScalarPerVector_E2 = E2;
|
||||
constexpr auto CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
|
||||
#endif
|
||||
|
||||
const auto in_n_c0_hi_wi_c1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
|
||||
const auto wei_k_c0_y_x_c1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
|
||||
const auto add_n_k0_hox2_wox2_k1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1));
|
||||
const auto out_n_k0_ho_wo_k1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
|
||||
|
||||
constexpr auto conv_driver =
|
||||
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add<
|
||||
BlockSize,
|
||||
typename vector_type<TInWei, InWeiVectorSize>::type,
|
||||
TAcc,
|
||||
TOut,
|
||||
E1,
|
||||
E2,
|
||||
K2,
|
||||
KPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
E1PerBlock,
|
||||
KPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
EPerThread,
|
||||
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
|
||||
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
|
||||
ABlockTransferSrcScalarPerVector_E2,
|
||||
ABlockTransferDstScalarPerVector_E2,
|
||||
BThreadTransferSrcScalarPerVector_E2,
|
||||
CThreadTransferDstScalarPerVector_K,
|
||||
activ_type>{};
|
||||
|
||||
std::cerr << "conv_bias_activ_resize_add_input_"
|
||||
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
|
||||
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_addout_n" << N << "k" << K0
|
||||
<< "h" << Ho * 2 << "w" << Wo * 2 << "k" << K1 << std::endl;
|
||||
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
|
||||
const auto ave_time =
|
||||
conv_driver.Run(wei_k_c0_y_x_c1_desc,
|
||||
in_n_c0_hi_wi_c1_desc,
|
||||
out_n_k0_ho_wo_k1_desc,
|
||||
add_n_k0_hox2_wox2_k1_desc,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pads,
|
||||
in_right_pads,
|
||||
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
|
||||
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
|
||||
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
|
||||
nrepeat);
|
||||
|
||||
{
|
||||
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
|
||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||
|
||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());
|
||||
|
||||
conv_driver.Run(wei_k_c0_y_x_c1_desc,
|
||||
in_n_c0_hi_wi_c1_desc,
|
||||
out_n_k0_ho_wo_k1_desc,
|
||||
add_n_k0_hox2_wox2_k1_desc,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pads,
|
||||
in_right_pads,
|
||||
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
|
||||
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
|
||||
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
|
||||
0);
|
||||
|
||||
add_n_k0_hox2_wox2_k1_device_buf.FromDevice(add_n_k0_hox2_wox2_k1_out.mData.data());
|
||||
}
|
||||
@@ -0,0 +1,196 @@
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
|
||||
|
||||
template <typename TInWei,
|
||||
typename TAcc,
|
||||
typename TOut,
|
||||
ck::ActivTypeEnum_t activ_type,
|
||||
typename InLengths,
|
||||
typename WeiLengths,
|
||||
typename OutLengths,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
|
||||
const InLengths& in_n_c0_hi_wi_c1_lengths,
|
||||
const WeiLengths& wei_k_c0_y_x_c1_lengths,
|
||||
const OutLengths& out_n_k0_ho_wo_k1_lengths,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& in_right_pads,
|
||||
const Tensor<TInWei>& in_n_c0_hi_wi_c1,
|
||||
const Tensor<TInWei>& wei_k_c0_y_x_c1,
|
||||
const Tensor<TOut>& bias_k0_k1,
|
||||
Tensor<TOut>& out_n_k0_ho_wo_k1,
|
||||
ck::index_t nrepeat)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
std::cout << __func__ << std::endl;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
|
||||
const auto N = out_n_k0_ho_wo_k1_lengths[I0];
|
||||
const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
|
||||
const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
|
||||
const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
|
||||
const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];
|
||||
|
||||
const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
|
||||
const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
|
||||
const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
|
||||
const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];
|
||||
|
||||
const auto K = wei_k_c0_y_x_c1_lengths[I0];
|
||||
const auto Y = wei_k_c0_y_x_c1_lengths[I2];
|
||||
const auto X = wei_k_c0_y_x_c1_lengths[I3];
|
||||
|
||||
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
|
||||
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
|
||||
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
|
||||
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
|
||||
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
|
||||
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
|
||||
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
|
||||
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
|
||||
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
|
||||
|
||||
constexpr index_t InWeiVectorSize = 8;
|
||||
|
||||
if(C1 % InWeiVectorSize != 0)
|
||||
{
|
||||
throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
|
||||
}
|
||||
|
||||
#if 0
|
||||
constexpr index_t BlockSize = 256;
|
||||
|
||||
constexpr index_t KPerBlock = 32;
|
||||
constexpr index_t HoPerBlock = 8;
|
||||
constexpr index_t WoPerBlock = 64;
|
||||
|
||||
constexpr index_t E1 = C0 * 9;
|
||||
constexpr index_t E2 = 1;
|
||||
constexpr index_t E1PerBlock = C0;
|
||||
|
||||
constexpr index_t KPerThread = 16;
|
||||
constexpr index_t HoPerThread = 2;
|
||||
constexpr index_t WoPerThread = 2;
|
||||
constexpr index_t EPerThread = 1;
|
||||
|
||||
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>;
|
||||
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;
|
||||
|
||||
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
|
||||
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
|
||||
|
||||
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
|
||||
|
||||
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
|
||||
#elif 1
|
||||
constexpr index_t BlockSize = 64;
|
||||
|
||||
constexpr index_t KPerBlock = 8;
|
||||
constexpr index_t HoPerBlock = 8;
|
||||
constexpr index_t WoPerBlock = 32;
|
||||
|
||||
constexpr index_t E1 = 2 * 9;
|
||||
constexpr index_t E2 = 1;
|
||||
constexpr index_t K2 = 2;
|
||||
constexpr index_t E1PerBlock = 2;
|
||||
|
||||
constexpr index_t KPerThread = KPerBlock;
|
||||
constexpr index_t HoPerThread = 2;
|
||||
constexpr index_t WoPerThread = 2;
|
||||
constexpr index_t EPerThread = 1;
|
||||
|
||||
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
|
||||
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
|
||||
Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
|
||||
|
||||
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
|
||||
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
|
||||
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
|
||||
constexpr index_t CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
|
||||
#endif
|
||||
|
||||
if(KPerThread % InWeiVectorSize != 0)
|
||||
{
|
||||
throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
|
||||
}
|
||||
|
||||
const auto in_n_c0_hi_wi_c1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
|
||||
const auto wei_k_c0_y_x_c1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
|
||||
const auto out_n_k0_ho_wo_k1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
|
||||
|
||||
constexpr auto conv_driver =
|
||||
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad<
|
||||
BlockSize,
|
||||
typename vector_type<TInWei, InWeiVectorSize>::type,
|
||||
TAcc,
|
||||
TOut,
|
||||
E1,
|
||||
E2,
|
||||
K2,
|
||||
KPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
E1PerBlock,
|
||||
KPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
EPerThread,
|
||||
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
|
||||
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
|
||||
ABlockTransferSrcScalarPerVector_E2,
|
||||
ABlockTransferDstScalarPerVector_E2,
|
||||
BThreadTransferSrcScalarPerVector_E2,
|
||||
CThreadTransferDstScalarPerVector_K,
|
||||
activ_type>{};
|
||||
|
||||
std::cerr << "conv_bias_activ_input_"
|
||||
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
|
||||
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
|
||||
<< "h" << Ho << "w" << Wo << "k" << K1 << std::endl;
|
||||
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
|
||||
const auto ave_time =
|
||||
conv_driver.Run(wei_k_c0_y_x_c1_desc,
|
||||
in_n_c0_hi_wi_c1_desc,
|
||||
out_n_k0_ho_wo_k1_desc,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pads,
|
||||
in_right_pads,
|
||||
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
|
||||
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
|
||||
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
|
||||
nrepeat);
|
||||
|
||||
{
|
||||
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
|
||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||
|
||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
|
||||
}
|
||||
@@ -1,190 +0,0 @@
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
|
||||
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp"
|
||||
|
||||
template <typename TInWei,
|
||||
ck::index_t InWeiVectorSize,
|
||||
typename TAcc,
|
||||
typename TOut,
|
||||
typename InLengths,
|
||||
typename WeiLengths,
|
||||
typename OutLengths,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
|
||||
const InLengths& in_n_c_hi_wi_lengths,
|
||||
const WeiLengths& wei_k_c_y_x_lengths,
|
||||
const OutLengths& out_n_k_ho_wo_lengths,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& in_right_pads,
|
||||
const Tensor<TInWei>& in_n_c_hi_wi,
|
||||
const Tensor<TInWei>& wei_k_c_y_x,
|
||||
Tensor<TOut>& out_n_k_ho_wo,
|
||||
ck::index_t /* nrepeat */)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
std::cout << __func__ << std::endl;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
const auto N = out_n_k_ho_wo_lengths[I0];
|
||||
const auto K = out_n_k_ho_wo_lengths[I1];
|
||||
const auto C = wei_k_c_y_x_lengths[I1];
|
||||
|
||||
const auto Hi = in_n_c_hi_wi_lengths[I2];
|
||||
const auto Wi = in_n_c_hi_wi_lengths[I3];
|
||||
|
||||
const auto Ho = out_n_k_ho_wo_lengths[I2];
|
||||
const auto Wo = out_n_k_ho_wo_lengths[I3];
|
||||
|
||||
const auto Y = wei_k_c_y_x_lengths[I2];
|
||||
const auto X = wei_k_c_y_x_lengths[I3];
|
||||
|
||||
const auto C0 = C / Number<InWeiVectorSize>{};
|
||||
const auto C1 = Number<InWeiVectorSize>{};
|
||||
|
||||
const auto K0 = K / Number<InWeiVectorSize>{};
|
||||
const auto K1 = Number<InWeiVectorSize>{};
|
||||
|
||||
Tensor<TInWei> in_n_c0_hi_wi_c1(
|
||||
HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
|
||||
Tensor<TInWei> wei_k_c0_y_x_c1(
|
||||
HostTensorDescriptor(std::initializer_list<index_t>{K, C0, Y, X, C1}));
|
||||
Tensor<TOut> out_n_k0_ho_wo_k1(
|
||||
HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Ho, Wo, K1}));
|
||||
|
||||
auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
|
||||
in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
|
||||
in_n_c_hi_wi(n, c, hi, wi);
|
||||
};
|
||||
|
||||
auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
|
||||
wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) =
|
||||
wei_k_c_y_x(k, c, y, x);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
|
||||
make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
|
||||
|
||||
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
|
||||
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
|
||||
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
|
||||
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
|
||||
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
|
||||
|
||||
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
|
||||
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
|
||||
|
||||
const auto in_n_c0_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi));
|
||||
const auto wei_k_c0_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X));
|
||||
const auto out_n_k0_ho_wo_k1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
|
||||
|
||||
#if 1
|
||||
// cdata = 64, BlockSize = 64, 16x8x32x4
|
||||
constexpr index_t BlockSize = 64;
|
||||
|
||||
constexpr index_t KPerBlock = 16;
|
||||
constexpr index_t HoPerBlock = 8;
|
||||
constexpr index_t WoPerBlock = 32;
|
||||
constexpr index_t EPerBlock = 1;
|
||||
|
||||
constexpr index_t KPerThread = KPerBlock;
|
||||
constexpr index_t HoPerThread = 2;
|
||||
constexpr index_t WoPerThread = 2;
|
||||
constexpr index_t EPerThread = EPerBlock;
|
||||
|
||||
using ABlockTransferThreadSliceLengths_E_K = Sequence<3, 1>;
|
||||
using ABlockTransferThreadClusterLengths_E_K = Sequence<3 * EPerBlock, KPerBlock>;
|
||||
|
||||
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
|
||||
constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
|
||||
|
||||
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
|
||||
|
||||
constexpr index_t CThreadTransferDstScalarPerVector_W = 16;
|
||||
|
||||
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
|
||||
#else
|
||||
constexpr index_t BlockSize = 64;
|
||||
|
||||
constexpr index_t KPerBlock = 16;
|
||||
constexpr index_t HoPerBlock = 8;
|
||||
constexpr index_t WoPerBlock = 32;
|
||||
constexpr index_t EPerBlock = 1;
|
||||
|
||||
constexpr index_t KPerThread = 16;
|
||||
constexpr index_t HoPerThread = 2;
|
||||
constexpr index_t WoPerThread = 2;
|
||||
constexpr index_t EPerThread = EPerBlock;
|
||||
|
||||
using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>;
|
||||
using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, 16>;
|
||||
|
||||
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
|
||||
constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
|
||||
|
||||
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
|
||||
|
||||
constexpr index_t CThreadTransferDstScalarPerVector_W = K1;
|
||||
|
||||
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
|
||||
#endif
|
||||
|
||||
constexpr auto conv_driver =
|
||||
#if 0
|
||||
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
|
||||
#else
|
||||
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
|
||||
#endif
|
||||
<BlockSize,
|
||||
typename vector_type<TInWei, InWeiVectorSize>::type,
|
||||
TAcc,
|
||||
TOut,
|
||||
KPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
EPerBlock,
|
||||
KPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
EPerThread,
|
||||
ABlockTransferThreadSliceLengths_E_K,
|
||||
ABlockTransferThreadClusterLengths_E_K,
|
||||
ABlockTransferSrcScalarPerVector_E,
|
||||
ABlockTransferDstScalarPerVector_K,
|
||||
BThreadTransferSrcScalarPerVector_W,
|
||||
CThreadTransferDstScalarPerVector_W>{};
|
||||
|
||||
conv_driver.Run(wei_k_c0_y_x_desc,
|
||||
in_n_c0_hi_wi_desc,
|
||||
out_n_k0_ho_wo_k1_desc,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pads,
|
||||
in_right_pads,
|
||||
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
|
||||
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
|
||||
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()));
|
||||
|
||||
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
|
||||
|
||||
auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
|
||||
out_n_k_ho_wo(n, k, ho, wo) =
|
||||
out_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)();
|
||||
}
|
||||
@@ -0,0 +1,212 @@
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
|
||||
|
||||
template <typename TInWei,
|
||||
typename TAcc,
|
||||
typename TOut,
|
||||
ck::ActivTypeEnum_t activ_type,
|
||||
typename InLengths,
|
||||
typename WeiLengths,
|
||||
typename MaxLengths,
|
||||
typename OutLengths,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
|
||||
const InLengths& in_n_c0_hi_wi_c1_lengths,
|
||||
const WeiLengths& wei_k_c0_y_x_c1_lengths,
|
||||
const MaxLengths& max_n_k0_hx_wx_k1_lengths,
|
||||
const OutLengths& out_n_k0_ho_wo_k1_lengths,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& in_right_pads,
|
||||
const Tensor<TInWei>& in_n_c0_hi_wi_c1,
|
||||
const Tensor<TInWei>& wei_k_c0_y_x_c1,
|
||||
const Tensor<TOut>& bias_k0_k1,
|
||||
Tensor<TOut>& out_n_k0_ho_wo_k1,
|
||||
Tensor<TOut>& max_n_k0_hx_wx_k1,
|
||||
ck::index_t nrepeat)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
std::cout << __func__ << std::endl;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
|
||||
const auto N = out_n_k0_ho_wo_k1_lengths[I0];
|
||||
const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
|
||||
const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
|
||||
const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
|
||||
const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];
|
||||
|
||||
const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
|
||||
const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
|
||||
const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
|
||||
const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];
|
||||
|
||||
const auto K = wei_k_c0_y_x_c1_lengths[I0];
|
||||
const auto Y = wei_k_c0_y_x_c1_lengths[I2];
|
||||
const auto X = wei_k_c0_y_x_c1_lengths[I3];
|
||||
|
||||
const auto Hx = max_n_k0_hx_wx_k1_lengths[I2];
|
||||
const auto Wx = max_n_k0_hx_wx_k1_lengths[I3];
|
||||
|
||||
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
|
||||
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
|
||||
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
|
||||
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
|
||||
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
|
||||
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
|
||||
DeviceMem max_n_k0_hx_wx_k1_device_buf(sizeof(TOut) *
|
||||
max_n_k0_hx_wx_k1.mDesc.GetElementSpace());
|
||||
|
||||
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
|
||||
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
|
||||
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
|
||||
max_n_k0_hx_wx_k1_device_buf.ToDevice(max_n_k0_hx_wx_k1.mData.data());
|
||||
|
||||
constexpr index_t InWeiVectorSize = 8;
|
||||
|
||||
if(C1 % InWeiVectorSize != 0)
|
||||
{
|
||||
throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
|
||||
}
|
||||
|
||||
#if 0
|
||||
constexpr index_t BlockSize = 256;
|
||||
|
||||
constexpr index_t KPerBlock = 32;
|
||||
constexpr index_t HoPerBlock = 8;
|
||||
constexpr index_t WoPerBlock = 64;
|
||||
|
||||
constexpr index_t E1 = C0 * 9;
|
||||
constexpr index_t E2 = 1;
|
||||
constexpr index_t E1PerBlock = C0;
|
||||
|
||||
constexpr index_t KPerThread = 16;
|
||||
constexpr index_t HoPerThread = 2;
|
||||
constexpr index_t WoPerThread = 2;
|
||||
constexpr index_t EPerThread = 1;
|
||||
|
||||
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>;
|
||||
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;
|
||||
|
||||
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
|
||||
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
|
||||
|
||||
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
|
||||
|
||||
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
|
||||
#elif 1
|
||||
constexpr index_t BlockSize = 64;
|
||||
|
||||
constexpr index_t KPerBlock = 8;
|
||||
constexpr index_t HoPerBlock = 8;
|
||||
constexpr index_t WoPerBlock = 32;
|
||||
|
||||
constexpr index_t E1 = 2 * 9;
|
||||
constexpr index_t E2 = 1;
|
||||
constexpr index_t K2 = 2;
|
||||
constexpr index_t E1PerBlock = 2;
|
||||
|
||||
constexpr index_t KPerThread = KPerBlock;
|
||||
constexpr index_t HoPerThread = 2;
|
||||
constexpr index_t WoPerThread = 2;
|
||||
constexpr index_t EPerThread = 1;
|
||||
|
||||
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
|
||||
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
|
||||
Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
|
||||
|
||||
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
|
||||
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
|
||||
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
|
||||
constexpr index_t CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
|
||||
#endif
|
||||
|
||||
if(KPerThread % InWeiVectorSize != 0)
|
||||
{
|
||||
throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
|
||||
}
|
||||
|
||||
const auto in_n_c0_hi_wi_c1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
|
||||
const auto wei_k_c0_y_x_c1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
|
||||
const auto max_n_k0_hx_wx_k1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1));
|
||||
const auto out_n_k0_ho_wo_k1_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
|
||||
|
||||
constexpr auto conv_driver =
|
||||
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool<
|
||||
BlockSize,
|
||||
typename vector_type<TInWei, InWeiVectorSize>::type,
|
||||
TAcc,
|
||||
TOut,
|
||||
E1,
|
||||
E2,
|
||||
K2,
|
||||
KPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
E1PerBlock,
|
||||
KPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
EPerThread,
|
||||
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
|
||||
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
|
||||
ABlockTransferSrcScalarPerVector_E2,
|
||||
ABlockTransferDstScalarPerVector_E2,
|
||||
BThreadTransferSrcScalarPerVector_E2,
|
||||
CThreadTransferDstScalarPerVector_K,
|
||||
activ_type>{};
|
||||
|
||||
std::cerr << "conv_bias_activ_maxpool_input_"
|
||||
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
|
||||
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
|
||||
<< "h" << Ho << "w" << Wo << "k" << K1 << "_maxpoolout_n" << N << "k" << K0 << "h"
|
||||
<< Ho / 2 << "w" << Wo / 2 << "k" << K1 << std::endl;
|
||||
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
|
||||
const auto ave_time =
|
||||
conv_driver.Run(wei_k_c0_y_x_c1_desc,
|
||||
in_n_c0_hi_wi_c1_desc,
|
||||
out_n_k0_ho_wo_k1_desc,
|
||||
max_n_k0_hx_wx_k1_desc,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pads,
|
||||
in_right_pads,
|
||||
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
|
||||
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
|
||||
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
|
||||
static_cast<TOut*>(max_n_k0_hx_wx_k1_device_buf.GetDeviceBuffer()),
|
||||
nrepeat);
|
||||
|
||||
{
|
||||
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
|
||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||
|
||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
|
||||
max_n_k0_hx_wx_k1_device_buf.FromDevice(max_n_k0_hx_wx_k1.mData.data());
|
||||
}
|
||||
@@ -0,0 +1,565 @@
|
||||
#ifndef DRIVER_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
|
||||
#define DRIVER_CONVOLUTION_ADD_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "gridwise_gemm_dlops_v3.hpp"
|
||||
|
||||
template <ck::index_t BlockSize,
|
||||
typename FloatAB,
|
||||
typename FloatAcc,
|
||||
typename FloatC,
|
||||
ck::index_t E1_,
|
||||
ck::index_t E2_,
|
||||
ck::index_t K2_,
|
||||
ck::index_t KPerBlock,
|
||||
ck::index_t HoPerBlock,
|
||||
ck::index_t WoPerBlock,
|
||||
ck::index_t E1PerBlock,
|
||||
ck::index_t KPerThread,
|
||||
ck::index_t HoPerThread,
|
||||
ck::index_t WoPerThread,
|
||||
ck::index_t EPerThread,
|
||||
typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
|
||||
typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector_E2,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_E2,
|
||||
ck::index_t BThreadTransferSrcScalarPerVector_E2,
|
||||
ck::index_t CThreadTransferDstScalarPerVector_K,
|
||||
ck::ActivTypeEnum_t activ_type>
|
||||
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
|
||||
{
|
||||
template <typename... Wei,
|
||||
typename... In,
|
||||
typename... Add,
|
||||
typename... Out,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
__host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
|
||||
const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
|
||||
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
|
||||
const ck::TensorDescriptor<Add...>& add_n_k0_hox2_wox2_k1_global_desc,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& in_right_pads,
|
||||
const FloatAB* __restrict__ p_a_grid,
|
||||
const FloatAB* __restrict__ p_b_grid,
|
||||
const FloatC* __restrict__ p_bias_grid,
|
||||
FloatC* __restrict__ p_d_grid,
|
||||
const int nrepeat) const
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
|
||||
const auto N = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
|
||||
const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
|
||||
const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
|
||||
const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
|
||||
// const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
|
||||
|
||||
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
|
||||
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
|
||||
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
|
||||
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
|
||||
|
||||
const auto Hox2 = add_n_k0_hox2_wox2_k1_global_desc.GetLength(I2);
|
||||
const auto Wox2 = add_n_k0_hox2_wox2_k1_global_desc.GetLength(I3);
|
||||
|
||||
const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
|
||||
const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
|
||||
const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);
|
||||
|
||||
const auto ConvStrideH = conv_strides[I0];
|
||||
const auto ConvStrideW = conv_strides[I1];
|
||||
|
||||
const auto ConvDilationH = conv_dilations[I0];
|
||||
const auto ConvDilationW = conv_dilations[I1];
|
||||
|
||||
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
|
||||
const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
|
||||
const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
|
||||
|
||||
const auto OutRightPadH = Hop - Ho;
|
||||
const auto OutRightPadW = Wop - Wo;
|
||||
|
||||
const auto OutRightPadHx = Number<OutRightPadH * 2>{};
|
||||
const auto OutRightPadWx = Number<OutRightPadW * 2>{};
|
||||
#else
|
||||
const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
|
||||
const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
|
||||
|
||||
const auto OutRightPadH = Hop - Ho;
|
||||
const auto OutRightPadW = Wop - Wo;
|
||||
|
||||
const auto OutRightPadHx = OutRightPadH * 2;
|
||||
const auto OutRightPadWx = OutRightPadW * 2;
|
||||
#endif
|
||||
|
||||
const auto InLeftPadH = in_left_pads[I0];
|
||||
const auto InLeftPadW = in_left_pads[I1];
|
||||
|
||||
const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
|
||||
const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;
|
||||
|
||||
const auto E = C0 * Y * X;
|
||||
|
||||
constexpr auto E1 = Number<E1_>{};
|
||||
constexpr auto E2 = Number<E2_>{};
|
||||
constexpr auto K2 = Number<K2_>{};
|
||||
|
||||
const auto E0 = E / E1;
|
||||
|
||||
// weight tensor
|
||||
const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
|
||||
make_tuple(make_pass_through_transform(K),
|
||||
make_pass_through_transform(C0 * Y * X),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
|
||||
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));
|
||||
|
||||
const auto a_e0_e1_k_e2_grid_desc =
|
||||
transform_tensor_descriptor(a_e_k_e2_grid_desc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
|
||||
make_pass_through_transform(K),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
|
||||
make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
// input tensor
|
||||
const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
|
||||
make_tuple(make_pass_through_transform(N),
|
||||
make_pass_through_transform(C0),
|
||||
make_pad_transform(Hi, InLeftPadH, InRightPadH),
|
||||
make_pad_transform(Wi, InLeftPadW, InRightPadW),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
|
||||
|
||||
const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
|
||||
in_n_c0_hip_wip_e2_global_desc,
|
||||
make_tuple(
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(C0),
|
||||
make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
|
||||
make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
|
||||
make_tuple(
|
||||
Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));
|
||||
|
||||
const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
|
||||
in_n_c0_y_ho_x_wo_e2_global_desc,
|
||||
make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(Hop),
|
||||
make_pass_through_transform(Wop),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(
|
||||
Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
|
||||
|
||||
const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
|
||||
in_e_n_ho_wo_e2_grid_desc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(Hop),
|
||||
make_pass_through_transform(Wop),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
|
||||
make_tuple(
|
||||
Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));
|
||||
|
||||
// output tensor
|
||||
const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
|
||||
make_tuple(make_merge_transform(make_tuple(K0, K1)),
|
||||
make_pass_through_transform(N),
|
||||
make_pad_transform(Ho, I0, OutRightPadH),
|
||||
make_pad_transform(Wo, I0, OutRightPadW)),
|
||||
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
// add tensor
|
||||
const auto d_k_n_hopx2_wopx2_grid_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1)),
|
||||
make_tuple(make_merge_transform(make_tuple(K0, K1)),
|
||||
make_pass_through_transform(N),
|
||||
make_pad_transform(Hox2, I0, OutRightPadHx),
|
||||
make_pad_transform(Wox2, I0, OutRightPadWx)),
|
||||
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
|
||||
|
||||
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
|
||||
(E1 % E1PerBlock) == 0))
|
||||
{
|
||||
throw std::runtime_error("wrong! GEMM size no divisible");
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
|
||||
// hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
|
||||
constexpr auto a_e0_e1_k_e2_global_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
|
||||
|
||||
constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
|
||||
|
||||
// hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
|
||||
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks =
|
||||
make_tuple(
|
||||
make_tuple(
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})
|
||||
);
|
||||
|
||||
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
|
||||
|
||||
// hack to control index calculation when iterating over c_k0_k1_n_h0_h1_h2_w0_w1_w2_global tensor
|
||||
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
|
||||
|
||||
constexpr auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
|
||||
|
||||
// clang-format on
|
||||
|
||||
// GEMM
|
||||
using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
|
||||
BlockSize,
|
||||
FloatAB,
|
||||
FloatAcc,
|
||||
FloatC,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
decltype(a_e0_e1_k_e2_grid_desc),
|
||||
decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
|
||||
decltype(c_k_n_hop_wop_grid_desc),
|
||||
decltype(d_k_n_hopx2_wopx2_grid_desc),
|
||||
E1,
|
||||
E2,
|
||||
K2,
|
||||
KPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
E1PerBlock,
|
||||
KPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
EPerThread,
|
||||
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
|
||||
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
|
||||
Sequence<2, 3, 0, 1, 4>,
|
||||
Sequence<0, 1, 2, 3, 4>,
|
||||
4,
|
||||
ABlockTransferSrcScalarPerVector_E2,
|
||||
ABlockTransferDstScalarPerVector_E2,
|
||||
false, // don't move back src coordinate after threadwise copy
|
||||
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
|
||||
9,
|
||||
BThreadTransferSrcScalarPerVector_E2,
|
||||
false, // don't move back src coordinate after threadwise copy, which will be fused with
|
||||
// MoveSrcSliceWindow() to save addr computation
|
||||
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, H2, W0, W1, W2
|
||||
1,
|
||||
CThreadTransferDstScalarPerVector_K,
|
||||
decltype(a_e0_e1_k_e2_global_step_hacks),
|
||||
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
|
||||
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
|
||||
decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks),
|
||||
decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
|
||||
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;
|
||||
|
||||
const auto a_e0_e1_k0_k1_e2_grid_desc =
|
||||
GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
|
||||
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
|
||||
GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
|
||||
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
|
||||
GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);
|
||||
const auto d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc =
|
||||
GridwiseGemm::MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd(
|
||||
d_k_n_hopx2_wopx2_grid_desc);
|
||||
|
||||
using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
|
||||
using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
|
||||
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
|
||||
using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
|
||||
using DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2 =
|
||||
decltype(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc);
|
||||
|
||||
const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
|
||||
|
||||
const bool has_main_e0_block_loop = E0 > 1;
|
||||
|
||||
std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;
|
||||
|
||||
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
|
||||
GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);
|
||||
|
||||
using CBlockIdToBlockClusterAdaptor_K_N_H_W =
|
||||
decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
|
||||
float ave_time = 0;
|
||||
|
||||
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
|
||||
|
||||
if(has_main_e0_block_loop)
|
||||
{
|
||||
const auto kernel = kernel_gemm_dlops_v3_resize_add<
|
||||
GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
true,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_d_grid,
|
||||
a_e0_e1_k0_k1_e2_grid_desc,
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
|
||||
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc,
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto kernel = kernel_gemm_dlops_v3_resize_add<
|
||||
GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
false,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_d_grid,
|
||||
a_e0_e1_k0_k1_e2_grid_desc,
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
|
||||
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc,
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
}
|
||||
|
||||
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
|
||||
DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2));
|
||||
DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf(
|
||||
sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2));
|
||||
DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf(
|
||||
sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2));
|
||||
DeviceMem d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf(
|
||||
sizeof(DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2));
|
||||
DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf(
|
||||
sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W));
|
||||
|
||||
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc);
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice(
|
||||
&b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice(
|
||||
&c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
|
||||
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.ToDevice(
|
||||
&d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc);
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice(
|
||||
&c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
|
||||
if(has_main_e0_block_loop)
|
||||
{
|
||||
|
||||
const auto kernel = kernel_gemm_dlops_v3_resize_add<
|
||||
GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
true,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_d_grid,
|
||||
cast_pointer_to_constant_address_space(
|
||||
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto kernel = kernel_gemm_dlops_v3_resize_add<
|
||||
GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
false,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_d_grid,
|
||||
cast_pointer_to_constant_address_space(
|
||||
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
|
||||
}
|
||||
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
|
||||
{
|
||||
static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
|
||||
static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
|
||||
static_assert(d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc.IsKnownAtCompileTime(), "");
|
||||
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
|
||||
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
|
||||
|
||||
const auto kernel = kernel_gemm_dlops_v3_resize_add<
|
||||
GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
has_main_e0_block_loop,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_d_grid);
|
||||
}
|
||||
#endif
|
||||
return ave_time;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
@@ -0,0 +1,500 @@
|
||||
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
|
||||
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "gridwise_gemm_dlops_v3.hpp"
|
||||
|
||||
template <ck::index_t BlockSize,
|
||||
typename FloatAB,
|
||||
typename FloatAcc,
|
||||
typename FloatC,
|
||||
ck::index_t E1_,
|
||||
ck::index_t E2_,
|
||||
ck::index_t K2_,
|
||||
ck::index_t KPerBlock,
|
||||
ck::index_t HoPerBlock,
|
||||
ck::index_t WoPerBlock,
|
||||
ck::index_t E1PerBlock,
|
||||
ck::index_t KPerThread,
|
||||
ck::index_t HoPerThread,
|
||||
ck::index_t WoPerThread,
|
||||
ck::index_t EPerThread,
|
||||
typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
|
||||
typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector_E2,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_E2,
|
||||
ck::index_t BThreadTransferSrcScalarPerVector_E2,
|
||||
ck::index_t CThreadTransferDstScalarPerVector_K,
|
||||
ck::ActivTypeEnum_t activ_type>
|
||||
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad
|
||||
{
|
||||
template <typename... Wei,
|
||||
typename... In,
|
||||
typename... Out,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
__host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
|
||||
const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
|
||||
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& in_right_pads,
|
||||
const FloatAB* __restrict__ p_a_grid,
|
||||
const FloatAB* __restrict__ p_b_grid,
|
||||
const FloatC* __restrict__ p_bias_grid,
|
||||
FloatC* __restrict__ p_c_grid,
|
||||
const int nrepeat) const
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
|
||||
const auto N = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
|
||||
const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
|
||||
const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
|
||||
const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
|
||||
// const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
|
||||
|
||||
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
|
||||
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
|
||||
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
|
||||
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
|
||||
|
||||
const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
|
||||
const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
|
||||
const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);
|
||||
|
||||
const auto ConvStrideH = conv_strides[I0];
|
||||
const auto ConvStrideW = conv_strides[I1];
|
||||
|
||||
const auto ConvDilationH = conv_dilations[I0];
|
||||
const auto ConvDilationW = conv_dilations[I1];
|
||||
|
||||
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
|
||||
const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
|
||||
const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
|
||||
#else
|
||||
const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
|
||||
const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
|
||||
#endif
|
||||
|
||||
const auto OutRightPadH = Hop - Ho;
|
||||
const auto OutRightPadW = Wop - Wo;
|
||||
|
||||
const auto InLeftPadH = in_left_pads[I0];
|
||||
const auto InLeftPadW = in_left_pads[I1];
|
||||
|
||||
const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
|
||||
const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;
|
||||
|
||||
const auto E = C0 * Y * X;
|
||||
|
||||
constexpr auto E1 = Number<E1_>{};
|
||||
constexpr auto E2 = Number<E2_>{};
|
||||
constexpr auto K2 = Number<K2_>{};
|
||||
|
||||
const auto E0 = E / E1;
|
||||
|
||||
// weight tensor
|
||||
const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
|
||||
make_tuple(make_pass_through_transform(K),
|
||||
make_pass_through_transform(C0 * Y * X),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
|
||||
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));
|
||||
|
||||
const auto a_e0_e1_k_e2_grid_desc =
|
||||
transform_tensor_descriptor(a_e_k_e2_grid_desc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
|
||||
make_pass_through_transform(K),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
|
||||
make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
// input tensor
|
||||
const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
|
||||
make_tuple(make_pass_through_transform(N),
|
||||
make_pass_through_transform(C0),
|
||||
make_pad_transform(Hi, InLeftPadH, InRightPadH),
|
||||
make_pad_transform(Wi, InLeftPadW, InRightPadW),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
|
||||
|
||||
const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
|
||||
in_n_c0_hip_wip_e2_global_desc,
|
||||
make_tuple(
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(C0),
|
||||
make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
|
||||
make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
|
||||
make_tuple(
|
||||
Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));
|
||||
|
||||
const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
|
||||
in_n_c0_y_ho_x_wo_e2_global_desc,
|
||||
make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(Hop),
|
||||
make_pass_through_transform(Wop),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(
|
||||
Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
|
||||
|
||||
const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
|
||||
in_e_n_ho_wo_e2_grid_desc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(Hop),
|
||||
make_pass_through_transform(Wop),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
|
||||
make_tuple(
|
||||
Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));
|
||||
|
||||
// output tensor
|
||||
const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
|
||||
make_tuple(make_merge_transform(make_tuple(K0, K1)),
|
||||
make_pass_through_transform(N),
|
||||
make_pad_transform(Ho, I0, OutRightPadH),
|
||||
make_pad_transform(Wo, I0, OutRightPadW)),
|
||||
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
|
||||
|
||||
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
|
||||
(E1 % E1PerBlock) == 0))
|
||||
{
|
||||
throw std::runtime_error("wrong! GEMM size no divisible");
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
|
||||
// hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
|
||||
constexpr auto a_e0_e1_k_e2_global_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
|
||||
|
||||
constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
|
||||
|
||||
// hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
|
||||
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks =
|
||||
make_tuple(
|
||||
make_tuple(
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})
|
||||
);
|
||||
|
||||
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
|
||||
|
||||
// hack to control index calculation when iterating over c_k0_k1_n_h0_h1_h2_w0_w1_w2_global tensor
|
||||
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
|
||||
// clang-format on
|
||||
|
||||
// GEMM
|
||||
using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
|
||||
BlockSize,
|
||||
FloatAB,
|
||||
FloatAcc,
|
||||
FloatC,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
decltype(a_e0_e1_k_e2_grid_desc),
|
||||
decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
|
||||
decltype(c_k_n_hop_wop_grid_desc),
|
||||
decltype(c_k_n_hop_wop_grid_desc),
|
||||
E1,
|
||||
E2,
|
||||
K2,
|
||||
KPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
E1PerBlock,
|
||||
KPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
EPerThread,
|
||||
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
|
||||
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
|
||||
Sequence<2, 3, 0, 1, 4>,
|
||||
Sequence<0, 1, 2, 3, 4>,
|
||||
4,
|
||||
ABlockTransferSrcScalarPerVector_E2,
|
||||
ABlockTransferDstScalarPerVector_E2,
|
||||
false, // don't move back src coordinate after threadwise copy
|
||||
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
|
||||
9,
|
||||
BThreadTransferSrcScalarPerVector_E2,
|
||||
false, // don't move back src coordinate after threadwise copy, which will be fused with
|
||||
// MoveSrcSliceWindow() to save addr computation
|
||||
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, H2, W0, W1, W2
|
||||
1,
|
||||
CThreadTransferDstScalarPerVector_K,
|
||||
decltype(a_e0_e1_k_e2_global_step_hacks),
|
||||
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
|
||||
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
|
||||
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
|
||||
decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
|
||||
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;
|
||||
|
||||
const auto a_e0_e1_k0_k1_e2_grid_desc =
|
||||
GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
|
||||
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
|
||||
GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
|
||||
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
|
||||
GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);
|
||||
|
||||
using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
|
||||
using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
|
||||
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
|
||||
using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
|
||||
|
||||
const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
|
||||
|
||||
const bool has_main_e0_block_loop = E0 > 1;
|
||||
|
||||
std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;
|
||||
|
||||
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
|
||||
GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);
|
||||
|
||||
using CBlockIdToBlockClusterAdaptor_K_N_H_W =
|
||||
decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
|
||||
float ave_time = 0;
|
||||
|
||||
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
|
||||
|
||||
if(has_main_e0_block_loop)
|
||||
{
|
||||
const auto kernel =
|
||||
kernel_gemm_dlops_v3<GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
true,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_c_grid,
|
||||
a_e0_e1_k0_k1_e2_grid_desc,
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto kernel =
|
||||
kernel_gemm_dlops_v3<GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
false,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_c_grid,
|
||||
a_e0_e1_k0_k1_e2_grid_desc,
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
}
|
||||
|
||||
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
|
||||
DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2));
|
||||
DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf(
|
||||
sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2));
|
||||
DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf(
|
||||
sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2));
|
||||
DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf(
|
||||
sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W));
|
||||
|
||||
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc);
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice(
|
||||
&b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice(
|
||||
&c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice(
|
||||
&c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
|
||||
if(has_main_e0_block_loop)
|
||||
{
|
||||
|
||||
const auto kernel =
|
||||
kernel_gemm_dlops_v3<GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
true,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_c_grid,
|
||||
cast_pointer_to_constant_address_space(
|
||||
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
const auto kernel =
|
||||
kernel_gemm_dlops_v3<GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
false,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_c_grid,
|
||||
cast_pointer_to_constant_address_space(
|
||||
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
|
||||
}
|
||||
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
|
||||
{
|
||||
static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
|
||||
static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
|
||||
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
|
||||
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
|
||||
|
||||
const auto kernel =
|
||||
kernel_gemm_dlops_v3<GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
has_main_e0_block_loop,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_c_grid);
|
||||
}
|
||||
#endif
|
||||
return ave_time;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
@@ -1,349 +0,0 @@
|
||||
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
|
||||
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "gridwise_gemm_dlops_v2.hpp"
|
||||
#include "gridwise_operation_wrapper.hpp"
|
||||
|
||||
template <ck::index_t BlockSize,
|
||||
typename FloatAB,
|
||||
typename FloatAcc,
|
||||
typename FloatC,
|
||||
ck::index_t KPerBlock,
|
||||
ck::index_t HoPerBlock,
|
||||
ck::index_t WoPerBlock,
|
||||
ck::index_t EPerBlock,
|
||||
ck::index_t KPerThread,
|
||||
ck::index_t HoPerThread,
|
||||
ck::index_t WoPerThread,
|
||||
ck::index_t EPerThread,
|
||||
typename ABlockTransferThreadSliceLengths_E_K,
|
||||
typename ABlockTransferThreadClusterLengths_E_K,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector_E,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_K,
|
||||
ck::index_t BThreadTransferSrcScalarPerVector_W,
|
||||
ck::index_t CThreadTransferDstScalarPerVector_W>
|
||||
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
|
||||
{
|
||||
template <typename... Wei,
|
||||
typename... In,
|
||||
typename... Out,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
__host__ void Run(const ck::TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
|
||||
const ck::TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
|
||||
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& in_right_pads,
|
||||
const FloatAB* __restrict__ p_wei_global,
|
||||
const FloatAB* __restrict__ p_in_global,
|
||||
FloatC* __restrict__ p_out_global) const
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
|
||||
const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
|
||||
const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
|
||||
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
|
||||
|
||||
const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
|
||||
const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
|
||||
|
||||
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
|
||||
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
|
||||
|
||||
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
|
||||
|
||||
const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
|
||||
const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
|
||||
const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
|
||||
|
||||
const auto ConvStrideH = conv_strides[I0];
|
||||
const auto ConvStrideW = conv_strides[I1];
|
||||
|
||||
const auto ConvDilationH = conv_dilations[I0];
|
||||
const auto ConvDilationW = conv_dilations[I1];
|
||||
|
||||
const auto InLeftPadH = in_left_pads[I0];
|
||||
const auto InLeftPadW = in_left_pads[I1];
|
||||
|
||||
const auto InRightPadH = in_right_pads[I0];
|
||||
const auto InRightPadW = in_right_pads[I1];
|
||||
|
||||
// weight tensor
|
||||
const auto wei_e_k_global_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
|
||||
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<1>{}, Sequence<0>{}));
|
||||
|
||||
// input tensor
|
||||
const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
|
||||
in_n_c_hi_wi_global_desc,
|
||||
make_tuple(make_pass_through_transform(N),
|
||||
make_pass_through_transform(C),
|
||||
make_pad_transform(Hi, InLeftPadH, InRightPadH),
|
||||
make_pad_transform(Wi, InLeftPadW, InRightPadW)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
|
||||
in_n_c_hip_wip_global_desc,
|
||||
make_tuple(
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(C),
|
||||
make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
|
||||
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
|
||||
|
||||
const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor(
|
||||
in_n_c_y_ho_x_wo_global_desc,
|
||||
make_tuple(make_merge_transform(make_tuple(C, Y, X)),
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(Ho),
|
||||
make_pass_through_transform(Wo)),
|
||||
make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
// output tensor
|
||||
const auto out_k_n_ho_wo_global_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
|
||||
make_tuple(make_merge_transform(make_tuple(K0, K1)),
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(Ho),
|
||||
make_pass_through_transform(Wo)),
|
||||
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
const auto E = C * Y * X;
|
||||
|
||||
if(!((K % KPerBlock) == 0 && (Ho % HoPerBlock) == 0 && (Wo % WoPerBlock) == 0 &&
|
||||
(E % EPerBlock) == 0))
|
||||
{
|
||||
throw std::runtime_error("wrong! GEMM size no divisible");
|
||||
}
|
||||
|
||||
// hack to control index calculation when iterating over a_k_m_global tensor
|
||||
constexpr auto a_e_k_global_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
|
||||
|
||||
constexpr auto a_e_k_global_move_slice_window_step_hack = Sequence<0, 0, 0>{};
|
||||
|
||||
constexpr auto b_e_n_ho_wo_global_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
|
||||
|
||||
constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack =
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{};
|
||||
|
||||
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
|
||||
// hack for NKHW format
|
||||
constexpr auto c_k_n_ho_wo_global_tensor_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 2, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{}));
|
||||
|
||||
#if 1
|
||||
// GEMM
|
||||
using gridwise_gemm = GridwiseGemmDlops_km_kn_mn_v3<
|
||||
BlockSize,
|
||||
FloatAB,
|
||||
FloatAcc,
|
||||
FloatC,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
decltype(wei_e_k_global_desc),
|
||||
decltype(in_e_n_ho_wo_global_desc),
|
||||
decltype(out_k_n_ho_wo_global_desc),
|
||||
KPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
EPerBlock,
|
||||
KPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
EPerThread,
|
||||
ABlockTransferThreadSliceLengths_E_K,
|
||||
ABlockTransferThreadClusterLengths_E_K,
|
||||
Sequence<1, 0>,
|
||||
Sequence<1, 0>,
|
||||
0,
|
||||
ABlockTransferSrcScalarPerVector_E,
|
||||
ABlockTransferDstScalarPerVector_K,
|
||||
false, // don't move back src coordinate after threadwise copy
|
||||
Sequence<0, 2, 3, 1>,
|
||||
3,
|
||||
BThreadTransferSrcScalarPerVector_W,
|
||||
false, // don't move back src coordinate after threadwise copy, which will be fused with
|
||||
// MoveSrcSliceWindow() to save addr computation
|
||||
Sequence<0, 2, 3, 1>,
|
||||
0,
|
||||
CThreadTransferDstScalarPerVector_W,
|
||||
decltype(a_e_k_global_step_hacks),
|
||||
decltype(b_e_n_ho_wo_global_step_hacks),
|
||||
decltype(c_k_n_ho_wo_global_tensor_step_hacks),
|
||||
decltype(a_e_k_global_move_slice_window_step_hack),
|
||||
decltype(b_e_n_ho_wo_global_move_slice_window_step_hack)>;
|
||||
|
||||
const auto GridSize = (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock) * N;
|
||||
|
||||
const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
|
||||
|
||||
const bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
|
||||
|
||||
index_t nrepeat = 100;
|
||||
|
||||
for(index_t i = 0; i < 5; ++i)
|
||||
{
|
||||
std::cout << "Start running " << nrepeat << " times..." << std::endl;
|
||||
|
||||
KernelTimer timer;
|
||||
timer.Start();
|
||||
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
|
||||
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
|
||||
<< std::endl;
|
||||
|
||||
for(index_t j = 0; j < nrepeat; ++j)
|
||||
{
|
||||
if(has_main_k_block_loop && has_double_tail_k_block_loop)
|
||||
{
|
||||
const auto kernel = run_gridwise_operation<gridwise_gemm,
|
||||
decltype(wei_e_k_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(in_e_n_ho_wo_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(out_k_n_ho_wo_global_desc),
|
||||
FloatC*,
|
||||
integral_constant<bool, true>,
|
||||
integral_constant<bool, true>>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
wei_e_k_global_desc,
|
||||
p_wei_global,
|
||||
in_e_n_ho_wo_global_desc,
|
||||
p_in_global,
|
||||
out_k_n_ho_wo_global_desc,
|
||||
p_out_global,
|
||||
integral_constant<bool, true>{},
|
||||
integral_constant<bool, true>{});
|
||||
}
|
||||
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
|
||||
{
|
||||
const auto kernel = run_gridwise_operation<gridwise_gemm,
|
||||
decltype(wei_e_k_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(in_e_n_ho_wo_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(out_k_n_ho_wo_global_desc),
|
||||
FloatC*,
|
||||
integral_constant<bool, true>,
|
||||
integral_constant<bool, false>>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
wei_e_k_global_desc,
|
||||
p_wei_global,
|
||||
in_e_n_ho_wo_global_desc,
|
||||
p_in_global,
|
||||
out_k_n_ho_wo_global_desc,
|
||||
p_out_global,
|
||||
integral_constant<bool, true>{},
|
||||
integral_constant<bool, false>{});
|
||||
}
|
||||
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
|
||||
{
|
||||
const auto kernel = run_gridwise_operation<gridwise_gemm,
|
||||
decltype(wei_e_k_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(in_e_n_ho_wo_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(out_k_n_ho_wo_global_desc),
|
||||
FloatC*,
|
||||
integral_constant<bool, false>,
|
||||
integral_constant<bool, true>>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
wei_e_k_global_desc,
|
||||
p_wei_global,
|
||||
in_e_n_ho_wo_global_desc,
|
||||
p_in_global,
|
||||
out_k_n_ho_wo_global_desc,
|
||||
p_out_global,
|
||||
integral_constant<bool, false>{},
|
||||
integral_constant<bool, true>{});
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto kernel = run_gridwise_operation<gridwise_gemm,
|
||||
decltype(wei_e_k_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(in_e_n_ho_wo_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(out_k_n_ho_wo_global_desc),
|
||||
FloatC*,
|
||||
integral_constant<bool, false>,
|
||||
integral_constant<bool, false>>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
wei_e_k_global_desc,
|
||||
p_wei_global,
|
||||
in_e_n_ho_wo_global_desc,
|
||||
p_in_global,
|
||||
out_k_n_ho_wo_global_desc,
|
||||
p_out_global,
|
||||
integral_constant<bool, false>{},
|
||||
integral_constant<bool, false>{});
|
||||
}
|
||||
}
|
||||
|
||||
timer.End();
|
||||
|
||||
float ave_time = timer.GetElapsedTime() / nrepeat;
|
||||
|
||||
float perf =
|
||||
static_cast<float>(calculate_convolution_flops(in_n_c_hi_wi_global_desc,
|
||||
wei_k_c_y_x_global_desc,
|
||||
out_n_k0_ho_wo_k1_global_desc)) /
|
||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||
|
||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
||||
<< std::endl;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
#endif
|
||||
@@ -1,364 +0,0 @@
|
||||
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
|
||||
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "gridwise_gemm_dlops_v2.hpp"
|
||||
#include "gridwise_operation_wrapper.hpp"
|
||||
|
||||
template <ck::index_t BlockSize,
|
||||
typename FloatAB,
|
||||
typename FloatAcc,
|
||||
typename FloatC,
|
||||
ck::index_t KPerBlock,
|
||||
ck::index_t HoPerBlock,
|
||||
ck::index_t WoPerBlock,
|
||||
ck::index_t EPerBlock,
|
||||
ck::index_t KPerThread,
|
||||
ck::index_t HoPerThread,
|
||||
ck::index_t WoPerThread,
|
||||
ck::index_t EPerThread,
|
||||
typename ABlockTransferThreadSliceLengths_E_K,
|
||||
typename ABlockTransferThreadClusterLengths_E_K,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector_E,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_K,
|
||||
ck::index_t BThreadTransferSrcScalarPerVector_W,
|
||||
ck::index_t CThreadTransferDstScalarPerVector_W>
|
||||
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
|
||||
{
|
||||
template <typename... Wei,
|
||||
typename... In,
|
||||
typename... Out,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
__host__ void Run(const ck::TensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
|
||||
const ck::TensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
|
||||
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& in_right_pads,
|
||||
const FloatAB* __restrict__ p_wei_global,
|
||||
const FloatAB* __restrict__ p_in_global,
|
||||
FloatC* __restrict__ p_out_global) const
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
|
||||
const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
|
||||
const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
|
||||
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
|
||||
|
||||
const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
|
||||
const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
|
||||
|
||||
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
|
||||
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
|
||||
|
||||
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
|
||||
|
||||
const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
|
||||
const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
|
||||
const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
|
||||
|
||||
const auto ConvStrideH = conv_strides[I0];
|
||||
const auto ConvStrideW = conv_strides[I1];
|
||||
|
||||
const auto ConvDilationH = conv_dilations[I0];
|
||||
const auto ConvDilationW = conv_dilations[I1];
|
||||
|
||||
const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
|
||||
const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
|
||||
|
||||
const auto OutRightPadH = Hop - Ho;
|
||||
const auto OutRightPadW = Wop - Wo;
|
||||
|
||||
const auto InLeftPadH = in_left_pads[I0];
|
||||
const auto InLeftPadW = in_left_pads[I1];
|
||||
|
||||
const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
|
||||
const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;
|
||||
|
||||
std::cerr << "OutRightPadH = " << OutRightPadH << " OutRightPadW = " << OutRightPadW
|
||||
<< std::endl;
|
||||
std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
|
||||
<< std::endl;
|
||||
|
||||
// weight tensor
|
||||
const auto wei_e_k_global_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
|
||||
make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<1>{}, Sequence<0>{}));
|
||||
|
||||
// input tensor
|
||||
const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
|
||||
in_n_c_hi_wi_global_desc,
|
||||
make_tuple(make_pass_through_transform(N),
|
||||
make_pass_through_transform(C),
|
||||
make_pad_transform(Hi, InLeftPadH, InRightPadH),
|
||||
make_pad_transform(Wi, InLeftPadW, InRightPadW)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
|
||||
in_n_c_hip_wip_global_desc,
|
||||
make_tuple(
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(C),
|
||||
make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
|
||||
make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW))),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
|
||||
|
||||
const auto in_e_n_ho_wo_global_desc = transform_tensor_descriptor(
|
||||
in_n_c_y_ho_x_wo_global_desc,
|
||||
make_tuple(make_merge_transform(make_tuple(C, Y, X)),
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(Hop),
|
||||
make_pass_through_transform(Wop)),
|
||||
make_tuple(Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
// output tensor
|
||||
const auto out_k_n_hop_wop_global_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
|
||||
make_tuple(make_merge_transform(make_tuple(K0, K1)),
|
||||
make_pass_through_transform(N),
|
||||
make_pad_transform(Ho, 0, OutRightPadH),
|
||||
make_pad_transform(Wo, 0, OutRightPadW)),
|
||||
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
const auto E = C * Y * X;
|
||||
|
||||
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
|
||||
|
||||
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
|
||||
(E % EPerBlock) == 0))
|
||||
{
|
||||
throw std::runtime_error("wrong! GEMM size no divisible");
|
||||
}
|
||||
|
||||
// hack to control index calculation when iterating over a_k_m_global tensor
|
||||
constexpr auto a_e_k_global_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
|
||||
|
||||
constexpr auto a_e_k_global_move_slice_window_step_hack = Sequence<0, 0, 0>{};
|
||||
|
||||
constexpr auto b_e_n_ho_wo_global_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
|
||||
|
||||
constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack =
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{};
|
||||
|
||||
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
|
||||
// hack for NKHW format
|
||||
constexpr auto c_k_n_ho_wo_global_tensor_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 2, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0>{}));
|
||||
|
||||
// GEMM
|
||||
using gridwise_gemm = GridwiseGemmDlops_km_kn_mn_v3<
|
||||
BlockSize,
|
||||
FloatAB,
|
||||
FloatAcc,
|
||||
FloatC,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
decltype(wei_e_k_global_desc),
|
||||
decltype(in_e_n_ho_wo_global_desc),
|
||||
decltype(out_k_n_hop_wop_global_desc),
|
||||
KPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
EPerBlock,
|
||||
KPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
EPerThread,
|
||||
ABlockTransferThreadSliceLengths_E_K,
|
||||
ABlockTransferThreadClusterLengths_E_K,
|
||||
Sequence<1, 0>,
|
||||
Sequence<1, 0>,
|
||||
0,
|
||||
ABlockTransferSrcScalarPerVector_E,
|
||||
ABlockTransferDstScalarPerVector_K,
|
||||
false, // don't move back src coordinate after threadwise copy
|
||||
Sequence<0, 2, 3, 1>,
|
||||
3,
|
||||
BThreadTransferSrcScalarPerVector_W,
|
||||
false, // don't move back src coordinate after threadwise copy, which will be fused with
|
||||
// MoveSrcSliceWindow() to save addr computation
|
||||
Sequence<0, 2, 3, 1>,
|
||||
0,
|
||||
CThreadTransferDstScalarPerVector_W,
|
||||
decltype(a_e_k_global_step_hacks),
|
||||
decltype(b_e_n_ho_wo_global_step_hacks),
|
||||
decltype(c_k_n_ho_wo_global_tensor_step_hacks),
|
||||
decltype(a_e_k_global_move_slice_window_step_hack),
|
||||
decltype(b_e_n_ho_wo_global_move_slice_window_step_hack)>;
|
||||
|
||||
const auto GridSize = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
|
||||
|
||||
const bool has_main_k_block_loop = (E + EPerBlock) / (2 * EPerBlock) > 1;
|
||||
|
||||
const bool has_double_tail_k_block_loop = (E / EPerBlock) % 2 == 0;
|
||||
|
||||
index_t nrepeat = 100;
|
||||
|
||||
for(index_t i = 0; i < 5; ++i)
|
||||
{
|
||||
std::cout << "Start running " << nrepeat << " times..." << std::endl;
|
||||
|
||||
KernelTimer timer;
|
||||
timer.Start();
|
||||
std::cout << "has_main_k_block_loop: " << has_main_k_block_loop
|
||||
<< " has_double_tail_k_block_loop: " << has_double_tail_k_block_loop
|
||||
<< std::endl;
|
||||
|
||||
for(index_t j = 0; j < nrepeat; ++j)
|
||||
{
|
||||
if(has_main_k_block_loop && has_double_tail_k_block_loop)
|
||||
{
|
||||
const auto kernel =
|
||||
run_gridwise_operation<gridwise_gemm,
|
||||
decltype(wei_e_k_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(in_e_n_ho_wo_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(out_k_n_hop_wop_global_desc),
|
||||
FloatC*,
|
||||
integral_constant<bool, true>,
|
||||
integral_constant<bool, true>>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
wei_e_k_global_desc,
|
||||
p_wei_global,
|
||||
in_e_n_ho_wo_global_desc,
|
||||
p_in_global,
|
||||
out_k_n_hop_wop_global_desc,
|
||||
p_out_global,
|
||||
integral_constant<bool, true>{},
|
||||
integral_constant<bool, true>{});
|
||||
}
|
||||
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
|
||||
{
|
||||
const auto kernel =
|
||||
run_gridwise_operation<gridwise_gemm,
|
||||
decltype(wei_e_k_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(in_e_n_ho_wo_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(out_k_n_hop_wop_global_desc),
|
||||
FloatC*,
|
||||
integral_constant<bool, true>,
|
||||
integral_constant<bool, false>>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
wei_e_k_global_desc,
|
||||
p_wei_global,
|
||||
in_e_n_ho_wo_global_desc,
|
||||
p_in_global,
|
||||
out_k_n_hop_wop_global_desc,
|
||||
p_out_global,
|
||||
integral_constant<bool, true>{},
|
||||
integral_constant<bool, false>{});
|
||||
}
|
||||
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
|
||||
{
|
||||
const auto kernel =
|
||||
run_gridwise_operation<gridwise_gemm,
|
||||
decltype(wei_e_k_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(in_e_n_ho_wo_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(out_k_n_hop_wop_global_desc),
|
||||
FloatC*,
|
||||
integral_constant<bool, false>,
|
||||
integral_constant<bool, true>>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
wei_e_k_global_desc,
|
||||
p_wei_global,
|
||||
in_e_n_ho_wo_global_desc,
|
||||
p_in_global,
|
||||
out_k_n_hop_wop_global_desc,
|
||||
p_out_global,
|
||||
integral_constant<bool, false>{},
|
||||
integral_constant<bool, true>{});
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto kernel =
|
||||
run_gridwise_operation<gridwise_gemm,
|
||||
decltype(wei_e_k_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(in_e_n_ho_wo_global_desc),
|
||||
const FloatAB*,
|
||||
decltype(out_k_n_hop_wop_global_desc),
|
||||
FloatC*,
|
||||
integral_constant<bool, false>,
|
||||
integral_constant<bool, false>>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
wei_e_k_global_desc,
|
||||
p_wei_global,
|
||||
in_e_n_ho_wo_global_desc,
|
||||
p_in_global,
|
||||
out_k_n_hop_wop_global_desc,
|
||||
p_out_global,
|
||||
integral_constant<bool, false>{},
|
||||
integral_constant<bool, false>{});
|
||||
}
|
||||
}
|
||||
|
||||
timer.End();
|
||||
|
||||
float ave_time = timer.GetElapsedTime() / nrepeat;
|
||||
|
||||
float perf =
|
||||
static_cast<float>(calculate_convolution_flops(in_n_c_hi_wi_global_desc,
|
||||
wei_k_c_y_x_global_desc,
|
||||
out_n_k0_ho_wo_k1_global_desc)) /
|
||||
(std::size_t(1000) * 1000 * 1000) / ave_time;
|
||||
|
||||
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif
|
||||
@@ -0,0 +1,569 @@
|
||||
#ifndef DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
|
||||
#define DRIVER_CONVOLUTION_MAXPOOL_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NC0HWc1_KC0YXC1_NK0HWK1_HPP
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "gridwise_gemm_dlops_v3.hpp"
|
||||
|
||||
template <ck::index_t BlockSize,
|
||||
typename FloatAB,
|
||||
typename FloatAcc,
|
||||
typename FloatC,
|
||||
ck::index_t E1_,
|
||||
ck::index_t E2_,
|
||||
ck::index_t K2_,
|
||||
ck::index_t KPerBlock,
|
||||
ck::index_t HoPerBlock,
|
||||
ck::index_t WoPerBlock,
|
||||
ck::index_t E1PerBlock,
|
||||
ck::index_t KPerThread,
|
||||
ck::index_t HoPerThread,
|
||||
ck::index_t WoPerThread,
|
||||
ck::index_t EPerThread,
|
||||
typename ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
|
||||
typename ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
|
||||
ck::index_t ABlockTransferSrcScalarPerVector_E2,
|
||||
ck::index_t ABlockTransferDstScalarPerVector_E2,
|
||||
ck::index_t BThreadTransferSrcScalarPerVector_E2,
|
||||
ck::index_t CThreadTransferDstScalarPerVector_K,
|
||||
ck::ActivTypeEnum_t activ_type>
|
||||
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
|
||||
{
|
||||
template <typename... Wei,
|
||||
typename... In,
|
||||
typename... MaxPool,
|
||||
typename... Out,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
__host__ float Run(const ck::TensorDescriptor<Wei...>& wei_k_c0_y_x_c1_global_desc,
|
||||
const ck::TensorDescriptor<In...>& in_n_c0_hi_wi_c1_global_desc,
|
||||
const ck::TensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
|
||||
const ck::TensorDescriptor<MaxPool...>& max_n_k0_hx_wx_k1_global_desc,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& in_right_pads,
|
||||
const FloatAB* __restrict__ p_a_grid,
|
||||
const FloatAB* __restrict__ p_b_grid,
|
||||
const FloatC* __restrict__ p_bias_grid,
|
||||
FloatC* __restrict__ p_c_grid,
|
||||
FloatC* __restrict__ p_d_grid,
|
||||
const int nrepeat) const
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
|
||||
const auto N = in_n_c0_hi_wi_c1_global_desc.GetLength(I0);
|
||||
const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
|
||||
const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
|
||||
const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
|
||||
// const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
|
||||
|
||||
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
|
||||
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
|
||||
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
|
||||
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
|
||||
|
||||
const auto Hx = max_n_k0_hx_wx_k1_global_desc.GetLength(I2);
|
||||
const auto Wx = max_n_k0_hx_wx_k1_global_desc.GetLength(I3);
|
||||
|
||||
const auto K = wei_k_c0_y_x_c1_global_desc.GetLength(I0);
|
||||
const auto Y = wei_k_c0_y_x_c1_global_desc.GetLength(I2);
|
||||
const auto X = wei_k_c0_y_x_c1_global_desc.GetLength(I3);
|
||||
|
||||
const auto ConvStrideH = conv_strides[I0];
|
||||
const auto ConvStrideW = conv_strides[I1];
|
||||
|
||||
const auto ConvDilationH = conv_dilations[I0];
|
||||
const auto ConvDilationW = conv_dilations[I1];
|
||||
|
||||
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
|
||||
const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
|
||||
const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
|
||||
|
||||
const auto OutRightPadH = Hop - Ho;
|
||||
const auto OutRightPadW = Wop - Wo;
|
||||
|
||||
const auto OutRightPadHx = Number<OutRightPadH / 2>{};
|
||||
const auto OutRightPadWx = Number<OutRightPadW / 2>{};
|
||||
#else
|
||||
const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
|
||||
const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
|
||||
|
||||
const auto OutRightPadH = Hop - Ho;
|
||||
const auto OutRightPadW = Wop - Wo;
|
||||
|
||||
const auto OutRightPadHx = OutRightPadH / 2;
|
||||
const auto OutRightPadWx = OutRightPadW / 2;
|
||||
#endif
|
||||
|
||||
const auto InLeftPadH = in_left_pads[I0];
|
||||
const auto InLeftPadW = in_left_pads[I1];
|
||||
|
||||
const auto InRightPadH = in_right_pads[I0] + OutRightPadH * ConvStrideH;
|
||||
const auto InRightPadW = in_right_pads[I1] + OutRightPadW * ConvStrideW;
|
||||
|
||||
const auto E = C0 * Y * X;
|
||||
|
||||
constexpr auto E1 = Number<E1_>{};
|
||||
constexpr auto E2 = Number<E2_>{};
|
||||
constexpr auto K2 = Number<K2_>{};
|
||||
|
||||
const auto E0 = E / E1;
|
||||
|
||||
// weight tensor
|
||||
const auto a_e_k_e2_grid_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
|
||||
make_tuple(make_pass_through_transform(K),
|
||||
make_pass_through_transform(C0 * Y * X),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
|
||||
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));
|
||||
|
||||
const auto a_e0_e1_k_e2_grid_desc =
|
||||
transform_tensor_descriptor(a_e_k_e2_grid_desc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
|
||||
make_pass_through_transform(K),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
|
||||
make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
// input tensor
|
||||
const auto in_n_c0_hip_wip_e2_global_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2)),
|
||||
make_tuple(make_pass_through_transform(N),
|
||||
make_pass_through_transform(C0),
|
||||
make_pad_transform(Hi, InLeftPadH, InRightPadH),
|
||||
make_pad_transform(Wi, InLeftPadW, InRightPadW),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
|
||||
|
||||
const auto in_n_c0_y_ho_x_wo_e2_global_desc = transform_tensor_descriptor(
|
||||
in_n_c0_hip_wip_e2_global_desc,
|
||||
make_tuple(
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(C0),
|
||||
make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
|
||||
make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
|
||||
make_tuple(
|
||||
Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));
|
||||
|
||||
const auto in_e_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
|
||||
in_n_c0_y_ho_x_wo_e2_global_desc,
|
||||
make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(Hop),
|
||||
make_pass_through_transform(Wop),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(
|
||||
Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
|
||||
|
||||
const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
|
||||
in_e_n_ho_wo_e2_grid_desc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
|
||||
make_pass_through_transform(N),
|
||||
make_pass_through_transform(Hop),
|
||||
make_pass_through_transform(Wop),
|
||||
make_pass_through_transform(E2)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
|
||||
make_tuple(
|
||||
Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));
|
||||
|
||||
// output tensor
|
||||
const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1)),
|
||||
make_tuple(make_merge_transform(make_tuple(K0, K1)),
|
||||
make_pass_through_transform(N),
|
||||
make_pad_transform(Ho, I0, OutRightPadH),
|
||||
make_pad_transform(Wo, I0, OutRightPadW)),
|
||||
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
// max tensor
|
||||
const auto d_k_n_hx_wx_grid_desc = transform_tensor_descriptor(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hx, Wx, K1)),
|
||||
make_tuple(make_merge_transform(make_tuple(K0, K1)),
|
||||
make_pass_through_transform(N),
|
||||
make_pad_transform(Hx, I0, OutRightPadHx),
|
||||
make_pad_transform(Wx, I0, OutRightPadWx)),
|
||||
make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
|
||||
|
||||
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
|
||||
(E1 % E1PerBlock) == 0))
|
||||
{
|
||||
throw std::runtime_error("wrong! GEMM size no divisible");
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
|
||||
// hack to control index calculation when iterating over a_e0_e1_k_e2_global tensor
|
||||
constexpr auto a_e0_e1_k_e2_global_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
|
||||
|
||||
constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
|
||||
|
||||
// hack to control index calculation when iterating over b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global tensor
|
||||
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks =
|
||||
make_tuple(
|
||||
make_tuple(
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})
|
||||
);
|
||||
|
||||
constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack =
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
|
||||
|
||||
constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
|
||||
|
||||
constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks =
|
||||
make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 1, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
|
||||
make_tuple(Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 2, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{},
|
||||
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
|
||||
|
||||
// clang-format on
|
||||
|
||||
// GEMM
|
||||
using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
|
||||
BlockSize,
|
||||
FloatAB,
|
||||
FloatAcc,
|
||||
FloatC,
|
||||
InMemoryDataOperationEnum_t::Set,
|
||||
decltype(a_e0_e1_k_e2_grid_desc),
|
||||
decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
|
||||
decltype(c_k_n_hop_wop_grid_desc),
|
||||
decltype(d_k_n_hx_wx_grid_desc),
|
||||
E1,
|
||||
E2,
|
||||
K2,
|
||||
KPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
E1PerBlock,
|
||||
KPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
EPerThread,
|
||||
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
|
||||
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
|
||||
Sequence<2, 3, 0, 1, 4>,
|
||||
Sequence<0, 1, 2, 3, 4>,
|
||||
4,
|
||||
ABlockTransferSrcScalarPerVector_E2,
|
||||
ABlockTransferDstScalarPerVector_E2,
|
||||
false, // don't move back src coordinate after threadwise copy
|
||||
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, // E0, E1, N, H0, H1, H2, W0, W1, W2, E2
|
||||
9,
|
||||
BThreadTransferSrcScalarPerVector_E2,
|
||||
false, // don't move back src coordinate after threadwise copy, which will be fused
|
||||
// with MoveSrcSliceWindow() to save addr computation
|
||||
Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>, // K0, K1, N, H0, H1, I2, H2, W0, W1, I2, W2
|
||||
1,
|
||||
CThreadTransferDstScalarPerVector_K,
|
||||
decltype(a_e0_e1_k_e2_global_step_hacks),
|
||||
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks),
|
||||
decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks),
|
||||
decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks),
|
||||
decltype(a_e0_e1_k_e2_global_move_slice_window_step_hack),
|
||||
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack)>;
|
||||
|
||||
const auto a_e0_e1_k0_k1_e2_grid_desc =
|
||||
GridwiseGemm::MakeAE0E1K0K1E2GridDescriptor(a_e0_e1_k_e2_grid_desc);
|
||||
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
|
||||
GridwiseGemm::MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(b_e0_e1_n_ho_wo_e2_grid_desc);
|
||||
const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
|
||||
GridwiseGemm::MakeCK0K1NH0H1H2W0W1W2GridDescriptor(c_k_n_hop_wop_grid_desc);
|
||||
const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc =
|
||||
GridwiseGemm::MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(d_k_n_hx_wx_grid_desc);
|
||||
|
||||
using AGridDesc_E0_E1_K0_K1_E2 = decltype(a_e0_e1_k0_k1_e2_grid_desc);
|
||||
using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 =
|
||||
decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
|
||||
using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
|
||||
using DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx = decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc);
|
||||
|
||||
const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
|
||||
|
||||
const bool has_main_e0_block_loop = E0 > 1;
|
||||
|
||||
std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;
|
||||
|
||||
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
|
||||
GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);
|
||||
|
||||
using CBlockIdToBlockClusterAdaptor_K_N_H_W =
|
||||
decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
|
||||
float ave_time = 0;
|
||||
|
||||
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
|
||||
|
||||
if(has_main_e0_block_loop)
|
||||
{
|
||||
const auto kernel = kernel_gemm_dlops_v3_maxpool<
|
||||
GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
true,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_c_grid,
|
||||
p_d_grid,
|
||||
a_e0_e1_k0_k1_e2_grid_desc,
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
|
||||
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto kernel = kernel_gemm_dlops_v3_maxpool<
|
||||
GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
false,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_c_grid,
|
||||
p_d_grid,
|
||||
a_e0_e1_k0_k1_e2_grid_desc,
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
|
||||
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
}
|
||||
|
||||
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
|
||||
DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2));
|
||||
DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf(
|
||||
sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2));
|
||||
DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf(
|
||||
sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2));
|
||||
DeviceMem d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf(
|
||||
sizeof(DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx));
|
||||
DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf(
|
||||
sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W));
|
||||
|
||||
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc);
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice(
|
||||
&b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice(
|
||||
&c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
|
||||
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.ToDevice(
|
||||
&d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc);
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice(
|
||||
&c_blockid_to_k_n_h_w_block_cluster_adaptor);
|
||||
|
||||
if(has_main_e0_block_loop)
|
||||
{
|
||||
|
||||
const auto kernel = kernel_gemm_dlops_v3_maxpool<
|
||||
GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
true,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_c_grid,
|
||||
p_d_grid,
|
||||
cast_pointer_to_constant_address_space(
|
||||
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
const auto kernel = kernel_gemm_dlops_v3_maxpool<
|
||||
GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
false,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_c_grid,
|
||||
p_d_grid,
|
||||
cast_pointer_to_constant_address_space(
|
||||
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
|
||||
}
|
||||
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
|
||||
{
|
||||
static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
|
||||
static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
|
||||
static_assert(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.IsKnownAtCompileTime(), "");
|
||||
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
|
||||
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
|
||||
|
||||
const auto kernel = kernel_gemm_dlops_v3_maxpool<
|
||||
GridwiseGemm,
|
||||
FloatAB,
|
||||
FloatC,
|
||||
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
|
||||
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
|
||||
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
|
||||
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
|
||||
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
|
||||
has_main_e0_block_loop,
|
||||
activ_type>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
p_a_grid,
|
||||
p_b_grid,
|
||||
p_bias_grid,
|
||||
p_c_grid,
|
||||
p_d_grid);
|
||||
}
|
||||
#endif
|
||||
return ave_time;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
414
host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
Normal file
414
host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
Normal file
@@ -0,0 +1,414 @@
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "conv_common.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
|
||||
|
||||
#define USE_DYNAMIC_MODE 0
|
||||
#define USE_CONV_FWD_V5R1_NCHWC 1
|
||||
|
||||
// Identifies a forward-convolution algorithm; V5R1NCHWC (value 0) is the only one defined here.
// NOTE(review): presumably compared against the `algo` command-line argument -- confirm in main().
enum ConvForwardAlgo
|
||||
{
|
||||
V5R1NCHWC // 0
|
||||
};
|
||||
|
||||
template <typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
|
||||
const Tensor<TWei>& wei,
|
||||
const Tensor<TOut>& add,
|
||||
const Tensor<TOut>& bias,
|
||||
Tensor<TOut>& add_host,
|
||||
Tensor<TOut>& out_host,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads&,
|
||||
const ck::ActivTypeEnum_t activ_type)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
|
||||
double v = 0;
|
||||
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
|
||||
|
||||
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
|
||||
{
|
||||
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
|
||||
{
|
||||
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
|
||||
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
|
||||
{
|
||||
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
|
||||
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
|
||||
wi < in.mDesc.GetLengths()[3])
|
||||
{
|
||||
|
||||
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
|
||||
{
|
||||
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
|
||||
static_cast<const double>(wei(k, c0, y, x, c1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
v += bias(k0, k1);
|
||||
v = activ(v, activ_type);
|
||||
|
||||
const int hox2 = ho * 2;
|
||||
const int wox2 = wo * 2;
|
||||
|
||||
out_host(n, k0, ho, wo, k1) = v;
|
||||
|
||||
add_host(n, k0, hox2, wox2, k1) = v + add(n, k0, hox2, wox2, k1);
|
||||
add_host(n, k0, hox2, wox2 + 1, k1) = v + add(n, k0, hox2, wox2 + 1, k1);
|
||||
add_host(n, k0, hox2 + 1, wox2, k1) = v + add(n, k0, hox2 + 1, wox2, k1);
|
||||
add_host(n, k0, hox2 + 1, wox2 + 1, k1) = v + add(n, k0, hox2 + 1, wox2 + 1, k1);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_nchw,
|
||||
out_host.mDesc.GetLengths()[0],
|
||||
out_host.mDesc.GetLengths()[1],
|
||||
out_host.mDesc.GetLengths()[2],
|
||||
out_host.mDesc.GetLengths()[3],
|
||||
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
constexpr auto I7 = Number<7>{};
|
||||
|
||||
#if USE_DYNAMIC_MODE
|
||||
// dynamic mode
|
||||
if(argc != 23)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
|
||||
"RightPx\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
const index_t N = std::stoi(argv[6]);
|
||||
const index_t K0 = std::stoi(argv[7]);
|
||||
const index_t K1 = std::stoi(argv[8]);
|
||||
const index_t C0 = std::stoi(argv[9]);
|
||||
const index_t C1 = std::stoi(argv[10]);
|
||||
const index_t Y = std::stoi(argv[11]);
|
||||
const index_t X = std::stoi(argv[12]);
|
||||
const index_t Hi = std::stoi(argv[13]);
|
||||
const index_t Wi = std::stoi(argv[14]);
|
||||
|
||||
const index_t conv_stride_h = std::stoi(argv[15]);
|
||||
const index_t conv_stride_w = std::stoi(argv[16]);
|
||||
const index_t conv_dilation_h = std::stoi(argv[17]);
|
||||
const index_t conv_dilation_w = std::stoi(argv[18]);
|
||||
const index_t in_left_pad_h = std::stoi(argv[19]);
|
||||
const index_t in_left_pad_w = std::stoi(argv[20]);
|
||||
const index_t in_right_pad_h = std::stoi(argv[21]);
|
||||
const index_t in_right_pad_w = std::stoi(argv[22]);
|
||||
|
||||
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
|
||||
const index_t XEff = (X - 1) * conv_dilation_w + 1;
|
||||
|
||||
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
|
||||
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
|
||||
const auto Hox2 = Ho * 2;
|
||||
const auto Wox2 = Wo * 2;
|
||||
#else
|
||||
// static mode
|
||||
if(argc < 6)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
#if 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
constexpr auto K0 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<540>{};
|
||||
constexpr auto Wi = Number<960>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<270>{};
|
||||
constexpr auto Wi = Number<480>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 1
|
||||
constexpr auto N = Number<128>{};
|
||||
constexpr auto Hi = Number<135>{};
|
||||
constexpr auto Wi = Number<240>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 1
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<32>{};
|
||||
constexpr auto Wi = Number<32>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
constexpr auto K0 = Number<8>{};
|
||||
#endif
|
||||
|
||||
constexpr auto conv_stride_h = I1;
|
||||
constexpr auto conv_stride_w = I1;
|
||||
constexpr auto conv_dilation_h = I1;
|
||||
constexpr auto conv_dilation_w = I1;
|
||||
constexpr auto in_left_pad_h = I1;
|
||||
constexpr auto in_left_pad_w = I1;
|
||||
constexpr auto in_right_pad_h = I1;
|
||||
constexpr auto in_right_pad_w = I1;
|
||||
|
||||
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
|
||||
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
|
||||
|
||||
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
|
||||
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
|
||||
|
||||
constexpr auto Hox2 = Number<Ho * 2>{};
|
||||
constexpr auto Wox2 = Number<Wo * 2>{};
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
using in_data_t = float;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = float;
|
||||
#elif 1
|
||||
using in_data_t = half_t;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = half_t;
|
||||
#elif 1
|
||||
using in_data_t = int8_t;
|
||||
using acc_data_t = int32_t;
|
||||
using out_data_t = int8_t;
|
||||
#endif
|
||||
|
||||
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
|
||||
add_lengths_host(5), bias_lengths_host(2);
|
||||
|
||||
in_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
in_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
in_lengths_host[2] = static_cast<std::size_t>(Hi);
|
||||
in_lengths_host[3] = static_cast<std::size_t>(Wi);
|
||||
in_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
|
||||
wei_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
wei_lengths_host[2] = static_cast<std::size_t>(Y);
|
||||
wei_lengths_host[3] = static_cast<std::size_t>(X);
|
||||
wei_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
out_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
out_lengths_host[1] = static_cast<std::size_t>(K0);
|
||||
out_lengths_host[2] = static_cast<std::size_t>(Ho);
|
||||
out_lengths_host[3] = static_cast<std::size_t>(Wo);
|
||||
out_lengths_host[4] = static_cast<std::size_t>(K1);
|
||||
|
||||
add_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
add_lengths_host[1] = static_cast<std::size_t>(K0);
|
||||
add_lengths_host[2] = static_cast<std::size_t>(Hox2);
|
||||
add_lengths_host[3] = static_cast<std::size_t>(Wox2);
|
||||
add_lengths_host[4] = static_cast<std::size_t>(K1);
|
||||
|
||||
bias_lengths_host[0] = static_cast<std::size_t>(K0);
|
||||
bias_lengths_host[1] = static_cast<std::size_t>(K1);
|
||||
|
||||
Tensor<in_data_t> in(in_lengths_host);
|
||||
Tensor<in_data_t> wei(wei_lengths_host);
|
||||
Tensor<in_data_t> add(add_lengths_host);
|
||||
Tensor<in_data_t> add_device(add_lengths_host);
|
||||
Tensor<in_data_t> add_host(add_lengths_host);
|
||||
Tensor<out_data_t> bias(bias_lengths_host);
|
||||
Tensor<out_data_t> out_host(out_lengths_host);
|
||||
|
||||
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
|
||||
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
|
||||
ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
|
||||
|
||||
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
|
||||
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
// no initialization
|
||||
break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 3:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 4:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 5:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
|
||||
|
||||
auto gen_wei = [](auto... is) {
|
||||
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
|
||||
};
|
||||
wei.GenerateTensorValue(gen_wei, num_thread);
|
||||
}
|
||||
|
||||
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
add.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
|
||||
auto f_make_for_device_nchwc = [&]() {
|
||||
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
|
||||
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
|
||||
const auto add_lengths_dev = make_tuple(N, K0, Hox2, Wox2, K1);
|
||||
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
|
||||
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
|
||||
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
|
||||
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
|
||||
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
|
||||
|
||||
return make_tuple(in_lengths_dev,
|
||||
wei_lengths_dev,
|
||||
add_lengths_dev,
|
||||
out_lengths_dev,
|
||||
conv_strides_dev,
|
||||
conv_dilations_dev,
|
||||
in_left_pads_dev,
|
||||
in_right_pads_dev);
|
||||
};
|
||||
|
||||
#if USE_CONV_FWD_V5R1_NCHWC
|
||||
if(algo == ConvForwardAlgo::V5R1NCHWC)
|
||||
{
|
||||
const auto tmp = f_make_for_device_nchwc();
|
||||
|
||||
device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
|
||||
acc_data_t,
|
||||
out_data_t,
|
||||
activ_type>(
|
||||
tmp[I0], // in_lengths_dev
|
||||
tmp[I1], // wei_lengths_dev
|
||||
tmp[I2], // add_lengths_dev
|
||||
tmp[I3], // out_lengths_dev
|
||||
tmp[I4], // conv_strides_dev
|
||||
tmp[I5], // conv_dilations_dev
|
||||
tmp[I6], // in_left_pads_dev
|
||||
tmp[I7], // in_right_pads_dev
|
||||
in,
|
||||
wei,
|
||||
bias,
|
||||
add,
|
||||
add_device,
|
||||
nrepeat);
|
||||
}
|
||||
#endif
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
host_direct_convolution_add_nchwc(in,
|
||||
wei,
|
||||
add,
|
||||
bias,
|
||||
add_host,
|
||||
out_host,
|
||||
make_tuple(conv_stride_h, conv_stride_w),
|
||||
make_tuple(conv_dilation_h, conv_dilation_w),
|
||||
make_tuple(in_left_pad_h, in_left_pad_w),
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
activ_type);
|
||||
|
||||
check_error(add_host, add_device);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "add_device: ", add_device.mData, ",") << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -15,17 +15,15 @@
|
||||
#include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
|
||||
|
||||
#define USE_DYNAMIC_MODE 1
|
||||
#define USE_DYNAMIC_MODE 0
|
||||
#define USE_CONV_FWD_V4R4_NCHW 0
|
||||
#define USE_CONV_FWD_V4R4R2_NHWC 0
|
||||
#define USE_CONV_FWD_V6R1_NCHW 0
|
||||
#define USE_CONV_FWD_V5R1_NCHW 0
|
||||
#define USE_CONV_FWD_V4R4R2_NHWC 1
|
||||
#define USE_CONV_FWD_V6R1_NCHW 1
|
||||
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
|
||||
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1
|
||||
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
|
||||
|
||||
enum ConvTensorLayout
|
||||
{
|
||||
@@ -41,9 +39,8 @@ enum ConvForwardAlgo
|
||||
V4R4NCHW, // 0
|
||||
V4R4R2NHWC, // 1
|
||||
V6R1NCHW, // 2
|
||||
V5R1NCHW, // 3
|
||||
V4R4R2XDLNCHW, // 4
|
||||
V4R4R4XDLNHWC // 5
|
||||
V4R4R2XDLNCHW, // 3
|
||||
V4R4R4XDLNHWC // 4
|
||||
};
|
||||
|
||||
template <typename TIn,
|
||||
@@ -237,8 +234,8 @@ int main(int argc, char* argv[])
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
|
||||
constexpr auto conv_stride_h = I2;
|
||||
constexpr auto conv_stride_w = I2;
|
||||
constexpr auto conv_stride_h = I1;
|
||||
constexpr auto conv_stride_w = I1;
|
||||
constexpr auto conv_dilation_h = I1;
|
||||
constexpr auto conv_dilation_w = I1;
|
||||
constexpr auto in_left_pad_h = I1;
|
||||
@@ -253,7 +250,7 @@ int main(int argc, char* argv[])
|
||||
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
using in_data_t = float;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = float;
|
||||
@@ -472,33 +469,6 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
#endif
|
||||
|
||||
#if USE_CONV_FWD_V5R1_NCHW
|
||||
if(algo == ConvForwardAlgo::V5R1NCHW)
|
||||
{
|
||||
if(layout != ConvTensorLayout::NCHW)
|
||||
{
|
||||
throw std::runtime_error("wrong! layout");
|
||||
}
|
||||
|
||||
const auto tmp = f_make_for_device_nchw();
|
||||
|
||||
device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw<in_data_t,
|
||||
16,
|
||||
acc_data_t,
|
||||
out_data_t>(tmp[I0],
|
||||
tmp[I1],
|
||||
tmp[I2],
|
||||
tmp[I3],
|
||||
tmp[I4],
|
||||
tmp[I5],
|
||||
tmp[I6],
|
||||
in,
|
||||
wei,
|
||||
out_device,
|
||||
nrepeat);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if USE_CONV_FWD_V4R4R2_XDL_NCHW
|
||||
if(algo == ConvForwardAlgo::V4R4R2XDLNCHW)
|
||||
{
|
||||
|
||||
391
host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp
Normal file
391
host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp
Normal file
@@ -0,0 +1,391 @@
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "conv_common.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
|
||||
|
||||
#define USE_DYNAMIC_MODE 0
|
||||
#define USE_CONV_FWD_V5R1_NCHWC 1
|
||||
|
||||
// Algorithm selector parsed from argv[1]; this driver only knows one algorithm.
enum ConvForwardAlgo
{
    V5R1NCHWC = 0
};
|
||||
|
||||
template <typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void host_direct_convolution_nchwc(const Tensor<TIn>& in,
|
||||
const Tensor<TWei>& wei,
|
||||
const Tensor<TOut>& bias,
|
||||
Tensor<TOut>& out,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads&,
|
||||
const ck::ActivTypeEnum_t activ_type)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
|
||||
double v = 0;
|
||||
const int k = k0 * out.mDesc.GetLengths()[4] + k1;
|
||||
|
||||
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
|
||||
{
|
||||
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
|
||||
{
|
||||
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
|
||||
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
|
||||
{
|
||||
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
|
||||
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
|
||||
wi < in.mDesc.GetLengths()[3])
|
||||
{
|
||||
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
|
||||
{
|
||||
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
|
||||
static_cast<const double>(wei(k, c0, y, x, c1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
v += bias(k0, k1);
|
||||
out(n, k0, ho, wo, k1) = activ(v, activ_type);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_nchw,
|
||||
out.mDesc.GetLengths()[0],
|
||||
out.mDesc.GetLengths()[1],
|
||||
out.mDesc.GetLengths()[2],
|
||||
out.mDesc.GetLengths()[3],
|
||||
out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
|
||||
#if USE_DYNAMIC_MODE
|
||||
// dynamic mode
|
||||
if(argc != 23)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
|
||||
"RightPx\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
const index_t N = std::stoi(argv[6]);
|
||||
const index_t K0 = std::stoi(argv[7]);
|
||||
const index_t K1 = std::stoi(argv[8]);
|
||||
const index_t C0 = std::stoi(argv[9]);
|
||||
const index_t C1 = std::stoi(argv[10]);
|
||||
const index_t Y = std::stoi(argv[11]);
|
||||
const index_t X = std::stoi(argv[12]);
|
||||
const index_t Hi = std::stoi(argv[13]);
|
||||
const index_t Wi = std::stoi(argv[14]);
|
||||
|
||||
const index_t conv_stride_h = std::stoi(argv[15]);
|
||||
const index_t conv_stride_w = std::stoi(argv[16]);
|
||||
const index_t conv_dilation_h = std::stoi(argv[17]);
|
||||
const index_t conv_dilation_w = std::stoi(argv[18]);
|
||||
const index_t in_left_pad_h = std::stoi(argv[19]);
|
||||
const index_t in_left_pad_w = std::stoi(argv[20]);
|
||||
const index_t in_right_pad_h = std::stoi(argv[21]);
|
||||
const index_t in_right_pad_w = std::stoi(argv[22]);
|
||||
|
||||
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
|
||||
const index_t XEff = (X - 1) * conv_dilation_w + 1;
|
||||
|
||||
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
|
||||
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
#else
|
||||
// static mode
|
||||
if(argc < 6)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
// constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid;
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
#if 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<1>{};
|
||||
constexpr auto K1 = Number<4>{};
|
||||
#elif 1
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<1>{};
|
||||
constexpr auto X = Number<1>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<540>{};
|
||||
constexpr auto Wi = Number<960>{};
|
||||
constexpr auto Y = Number<1>{};
|
||||
constexpr auto X = Number<1>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<128>{};
|
||||
constexpr auto Hi = Number<270>{};
|
||||
constexpr auto Wi = Number<480>{};
|
||||
constexpr auto Y = Number<1>{};
|
||||
constexpr auto X = Number<1>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#endif
|
||||
|
||||
constexpr auto conv_stride_h = I1;
|
||||
constexpr auto conv_stride_w = I1;
|
||||
constexpr auto conv_dilation_h = I1;
|
||||
constexpr auto conv_dilation_w = I1;
|
||||
|
||||
#if 1
|
||||
constexpr auto in_left_pad_h = I1;
|
||||
constexpr auto in_left_pad_w = I1;
|
||||
constexpr auto in_right_pad_h = I1;
|
||||
constexpr auto in_right_pad_w = I1;
|
||||
#else
|
||||
constexpr auto in_left_pad_h = I0;
|
||||
constexpr auto in_left_pad_w = I0;
|
||||
constexpr auto in_right_pad_h = I0;
|
||||
constexpr auto in_right_pad_w = I0;
|
||||
#endif
|
||||
|
||||
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
|
||||
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
|
||||
|
||||
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
|
||||
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
using in_data_t = float;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = float;
|
||||
#elif 1
|
||||
using in_data_t = half_t;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = half_t;
|
||||
#elif 1
|
||||
using in_data_t = int8_t;
|
||||
using acc_data_t = int32_t;
|
||||
using out_data_t = int8_t;
|
||||
#endif
|
||||
|
||||
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
|
||||
bias_lengths_host(2);
|
||||
|
||||
in_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
in_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
in_lengths_host[2] = static_cast<std::size_t>(Hi);
|
||||
in_lengths_host[3] = static_cast<std::size_t>(Wi);
|
||||
in_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
|
||||
wei_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
wei_lengths_host[2] = static_cast<std::size_t>(Y);
|
||||
wei_lengths_host[3] = static_cast<std::size_t>(X);
|
||||
wei_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
out_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
out_lengths_host[1] = static_cast<std::size_t>(K0);
|
||||
out_lengths_host[2] = static_cast<std::size_t>(Ho);
|
||||
out_lengths_host[3] = static_cast<std::size_t>(Wo);
|
||||
out_lengths_host[4] = static_cast<std::size_t>(K1);
|
||||
|
||||
bias_lengths_host[0] = static_cast<std::size_t>(K0);
|
||||
bias_lengths_host[1] = static_cast<std::size_t>(K1);
|
||||
|
||||
Tensor<in_data_t> in(in_lengths_host);
|
||||
Tensor<in_data_t> wei(wei_lengths_host);
|
||||
Tensor<out_data_t> bias(bias_lengths_host);
|
||||
Tensor<out_data_t> out_host(out_lengths_host);
|
||||
Tensor<out_data_t> out_device(out_lengths_host);
|
||||
|
||||
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
|
||||
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
|
||||
ostream_HostTensorDescriptor(bias.mDesc, std::cout << "bias: ");
|
||||
ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
|
||||
|
||||
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
|
||||
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
// no initialization
|
||||
break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 3:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 4:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 5:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
|
||||
|
||||
auto gen_wei = [](auto... is) {
|
||||
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
|
||||
};
|
||||
wei.GenerateTensorValue(gen_wei, num_thread);
|
||||
}
|
||||
|
||||
auto f_make_for_device_nchwc = [&]() {
|
||||
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
|
||||
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
|
||||
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
|
||||
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
|
||||
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
|
||||
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
|
||||
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
|
||||
|
||||
return make_tuple(in_lengths_dev,
|
||||
wei_lengths_dev,
|
||||
out_lengths_dev,
|
||||
conv_strides_dev,
|
||||
conv_dilations_dev,
|
||||
in_left_pads_dev,
|
||||
in_right_pads_dev);
|
||||
};
|
||||
|
||||
#if USE_CONV_FWD_V5R1_NCHWC
|
||||
if(algo == ConvForwardAlgo::V5R1NCHWC)
|
||||
{
|
||||
const auto tmp = f_make_for_device_nchwc();
|
||||
|
||||
device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
|
||||
acc_data_t,
|
||||
out_data_t,
|
||||
activ_type>(
|
||||
tmp[I0],
|
||||
tmp[I1],
|
||||
tmp[I2],
|
||||
tmp[I3],
|
||||
tmp[I4],
|
||||
tmp[I5],
|
||||
tmp[I6],
|
||||
in,
|
||||
wei,
|
||||
bias,
|
||||
out_device,
|
||||
nrepeat);
|
||||
}
|
||||
#endif
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
host_direct_convolution_nchwc(in,
|
||||
wei,
|
||||
bias,
|
||||
out_host,
|
||||
make_tuple(conv_stride_h, conv_stride_w),
|
||||
make_tuple(conv_dilation_h, conv_dilation_w),
|
||||
make_tuple(in_left_pad_h, in_left_pad_w),
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
activ_type);
|
||||
|
||||
check_error(out_host, out_device);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "bias: ", bias.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,413 @@
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "conv_common.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
|
||||
|
||||
#define USE_DYNAMIC_MODE 0
|
||||
#define USE_CONV_FWD_V5R1_NCHWC 1
|
||||
|
||||
// Algorithm selector parsed from argv[1]; this driver only knows one algorithm.
enum ConvForwardAlgo
{
    V5R1NCHWC = 0
};
|
||||
|
||||
template <typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
|
||||
const Tensor<TWei>& wei,
|
||||
const Tensor<TOut>& bias,
|
||||
Tensor<TOut>& out_host,
|
||||
Tensor<TOut>& max_host,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads&,
|
||||
const ck::ActivTypeEnum_t activ_type)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
|
||||
double v = 0;
|
||||
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
|
||||
|
||||
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
|
||||
{
|
||||
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
|
||||
{
|
||||
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
|
||||
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
|
||||
{
|
||||
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
|
||||
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
|
||||
wi < in.mDesc.GetLengths()[3])
|
||||
{
|
||||
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
|
||||
{
|
||||
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
|
||||
static_cast<const double>(wei(k, c0, y, x, c1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
v += bias(k0, k1);
|
||||
v = activ(v, activ_type);
|
||||
|
||||
out_host(n, k0, ho, wo, k1) = v;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_nchw,
|
||||
out_host.mDesc.GetLengths()[0],
|
||||
out_host.mDesc.GetLengths()[1],
|
||||
out_host.mDesc.GetLengths()[2],
|
||||
out_host.mDesc.GetLengths()[3],
|
||||
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
|
||||
|
||||
auto maxpool_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
|
||||
auto hx = ho * 2;
|
||||
auto wx = wo * 2;
|
||||
|
||||
auto v0 = out_host(n, k0, hx, wx, k1);
|
||||
auto v1 = out_host(n, k0, hx, wx + 1, k1);
|
||||
auto v2 = out_host(n, k0, hx + 1, wx, k1);
|
||||
auto v3 = out_host(n, k0, hx + 1, wx + 1, k1);
|
||||
|
||||
max_host(n, k0, ho, wo, k1) = std::max({v0, v1, v2, v3});
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(maxpool_nchw,
|
||||
max_host.mDesc.GetLengths()[0],
|
||||
max_host.mDesc.GetLengths()[1],
|
||||
max_host.mDesc.GetLengths()[2],
|
||||
max_host.mDesc.GetLengths()[3],
|
||||
max_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
constexpr auto I7 = Number<7>{};
|
||||
|
||||
#if USE_DYNAMIC_MODE
|
||||
// dynamic mode
|
||||
if(argc != 23)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
|
||||
"RightPx\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
const index_t N = std::stoi(argv[6]);
|
||||
const index_t K0 = std::stoi(argv[7]);
|
||||
const index_t K1 = std::stoi(argv[8]);
|
||||
const index_t C0 = std::stoi(argv[9]);
|
||||
const index_t C1 = std::stoi(argv[10]);
|
||||
const index_t Y = std::stoi(argv[11]);
|
||||
const index_t X = std::stoi(argv[12]);
|
||||
const index_t Hi = std::stoi(argv[13]);
|
||||
const index_t Wi = std::stoi(argv[14]);
|
||||
|
||||
const index_t conv_stride_h = std::stoi(argv[15]);
|
||||
const index_t conv_stride_w = std::stoi(argv[16]);
|
||||
const index_t conv_dilation_h = std::stoi(argv[17]);
|
||||
const index_t conv_dilation_w = std::stoi(argv[18]);
|
||||
const index_t in_left_pad_h = std::stoi(argv[19]);
|
||||
const index_t in_left_pad_w = std::stoi(argv[20]);
|
||||
const index_t in_right_pad_h = std::stoi(argv[21]);
|
||||
const index_t in_right_pad_w = std::stoi(argv[22]);
|
||||
|
||||
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
|
||||
const index_t XEff = (X - 1) * conv_dilation_w + 1;
|
||||
|
||||
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
|
||||
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
|
||||
const index_t Ho_2 = Ho / 2;
|
||||
const index_t Wo_2 = Wo / 2;
|
||||
#else
|
||||
// static mode
|
||||
if(argc < 6)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
#if 1
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<3>{};
|
||||
constexpr auto C1 = Number<4>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<540>{};
|
||||
constexpr auto Wi = Number<960>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<128>{};
|
||||
constexpr auto Hi = Number<270>{};
|
||||
constexpr auto Wi = Number<480>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#endif
|
||||
|
||||
constexpr auto conv_stride_h = I1;
|
||||
constexpr auto conv_stride_w = I1;
|
||||
constexpr auto conv_dilation_h = I1;
|
||||
constexpr auto conv_dilation_w = I1;
|
||||
constexpr auto in_left_pad_h = I1;
|
||||
constexpr auto in_left_pad_w = I1;
|
||||
constexpr auto in_right_pad_h = I1;
|
||||
constexpr auto in_right_pad_w = I1;
|
||||
|
||||
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
|
||||
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
|
||||
|
||||
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
|
||||
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
|
||||
|
||||
constexpr auto Ho_2 = Number<Ho / 2>{};
|
||||
constexpr auto Wo_2 = Number<Wo / 2>{};
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
using in_data_t = float;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = float;
|
||||
#elif 1
|
||||
using in_data_t = half_t;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = half_t;
|
||||
#elif 1
|
||||
using in_data_t = int8_t;
|
||||
using acc_data_t = int32_t;
|
||||
using out_data_t = int8_t;
|
||||
#endif
|
||||
|
||||
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
|
||||
max_lengths_host(5), bias_lengths_host(2);
|
||||
|
||||
in_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
in_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
in_lengths_host[2] = static_cast<std::size_t>(Hi);
|
||||
in_lengths_host[3] = static_cast<std::size_t>(Wi);
|
||||
in_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
|
||||
wei_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
wei_lengths_host[2] = static_cast<std::size_t>(Y);
|
||||
wei_lengths_host[3] = static_cast<std::size_t>(X);
|
||||
wei_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
out_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
out_lengths_host[1] = static_cast<std::size_t>(K0);
|
||||
out_lengths_host[2] = static_cast<std::size_t>(Ho);
|
||||
out_lengths_host[3] = static_cast<std::size_t>(Wo);
|
||||
out_lengths_host[4] = static_cast<std::size_t>(K1);
|
||||
|
||||
max_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
max_lengths_host[1] = static_cast<std::size_t>(K0);
|
||||
max_lengths_host[2] = static_cast<std::size_t>(Ho_2);
|
||||
max_lengths_host[3] = static_cast<std::size_t>(Wo_2);
|
||||
max_lengths_host[4] = static_cast<std::size_t>(K1);
|
||||
|
||||
bias_lengths_host[0] = static_cast<std::size_t>(K0);
|
||||
bias_lengths_host[1] = static_cast<std::size_t>(K1);
|
||||
|
||||
Tensor<in_data_t> in(in_lengths_host);
|
||||
Tensor<in_data_t> wei(wei_lengths_host);
|
||||
Tensor<out_data_t> bias(bias_lengths_host);
|
||||
Tensor<out_data_t> out_device(out_lengths_host);
|
||||
Tensor<out_data_t> out_host(out_lengths_host);
|
||||
Tensor<in_data_t> max_device(max_lengths_host);
|
||||
Tensor<in_data_t> max_host(max_lengths_host);
|
||||
|
||||
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
|
||||
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
|
||||
|
||||
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
|
||||
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
// no initialization
|
||||
break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 3:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 4:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 5:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
|
||||
|
||||
auto gen_wei = [](auto... is) {
|
||||
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
|
||||
};
|
||||
wei.GenerateTensorValue(gen_wei, num_thread);
|
||||
}
|
||||
|
||||
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
|
||||
auto f_make_for_device_nchwc = [&]() {
|
||||
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
|
||||
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
|
||||
const auto max_lengths_dev = make_tuple(N, K0, Ho_2, Wo_2, K1);
|
||||
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
|
||||
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
|
||||
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
|
||||
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
|
||||
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
|
||||
|
||||
return make_tuple(in_lengths_dev,
|
||||
wei_lengths_dev,
|
||||
max_lengths_dev,
|
||||
out_lengths_dev,
|
||||
conv_strides_dev,
|
||||
conv_dilations_dev,
|
||||
in_left_pads_dev,
|
||||
in_right_pads_dev);
|
||||
};
|
||||
|
||||
#if USE_CONV_FWD_V5R1_NCHWC
|
||||
if(algo == ConvForwardAlgo::V5R1NCHWC)
|
||||
{
|
||||
const auto tmp = f_make_for_device_nchwc();
|
||||
|
||||
device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<
|
||||
in_data_t,
|
||||
acc_data_t,
|
||||
out_data_t,
|
||||
activ_type>(tmp[I0], // in_lengths_dev
|
||||
tmp[I1], // wei_lengths_dev
|
||||
tmp[I2], // max_lengths_dev
|
||||
tmp[I3], // out_lengths_dev
|
||||
tmp[I4], // conv_strides_dev
|
||||
tmp[I5], // conv_dilations_dev
|
||||
tmp[I6], // in_left_pads_dev
|
||||
tmp[I7], // in_right_pads_dev
|
||||
in,
|
||||
wei,
|
||||
bias,
|
||||
out_device,
|
||||
max_device,
|
||||
nrepeat);
|
||||
}
|
||||
#endif
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
host_direct_convolution_maxpool_nchwc(in,
|
||||
wei,
|
||||
bias,
|
||||
out_host,
|
||||
max_host,
|
||||
make_tuple(conv_stride_h, conv_stride_w),
|
||||
make_tuple(conv_dilation_h, conv_dilation_w),
|
||||
make_tuple(in_left_pad_h, in_left_pad_w),
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
activ_type);
|
||||
|
||||
check_error(out_host, out_device);
|
||||
check_error(max_host, max_device);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
// LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
|
||||
// LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
|
||||
// LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
|
||||
// std::endl;
|
||||
LogRangeAsType<float>(std::cout << "max_host: ", max_host.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "max_device: ", max_device.mData, ",") << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -74,4 +74,17 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes
|
||||
return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline auto activ(T v, const ck::ActivTypeEnum_t activ_type)
|
||||
{
|
||||
const T alpha = 0.3;
|
||||
switch(activ_type)
|
||||
{
|
||||
case ck::ActivTypeEnum_t::None: return v;
|
||||
case ck::ActivTypeEnum_t::LeakyRelu: return (v >= 0 ? v : alpha * v);
|
||||
case ck::ActivTypeEnum_t::Sigmoid: return (1 / (1 + exp(-v)));
|
||||
default: throw std::runtime_error("unsupported activ type"); break;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -257,6 +257,18 @@ struct Tensor
|
||||
mDesc.GetLengths()[3])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 5: {
|
||||
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
|
||||
(*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
|
||||
};
|
||||
make_ParallelTensorFunctor(f,
|
||||
mDesc.GetLengths()[0],
|
||||
mDesc.GetLengths()[1],
|
||||
mDesc.GetLengths()[2],
|
||||
mDesc.GetLengths()[3],
|
||||
mDesc.GetLengths()[4])(num_thread);
|
||||
break;
|
||||
}
|
||||
default: throw std::runtime_error("unspported dimension");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user