mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 08:50:17 +00:00
refactor
This commit is contained in:
@@ -391,7 +391,7 @@ int main()
|
||||
|
||||
constexpr unsigned HPad = 0;
|
||||
constexpr unsigned WPad = 0;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// 3x3, 34x34
|
||||
constexpr unsigned N = 64;
|
||||
constexpr unsigned C = 256;
|
||||
@@ -430,6 +430,9 @@ int main()
|
||||
constexpr unsigned K = 64;
|
||||
constexpr unsigned S = 5;
|
||||
constexpr unsigned R = 5;
|
||||
|
||||
constexpr unsigned HPad = 0;
|
||||
constexpr unsigned WPad = 0;
|
||||
#elif 0
|
||||
// 7x7, 38x38
|
||||
constexpr unsigned N = 64;
|
||||
@@ -439,6 +442,9 @@ int main()
|
||||
constexpr unsigned K = 64;
|
||||
constexpr unsigned S = 7;
|
||||
constexpr unsigned R = 7;
|
||||
|
||||
constexpr unsigned HPad = 0;
|
||||
constexpr unsigned WPad = 0;
|
||||
#elif 0
|
||||
// 3x3, 58x58
|
||||
constexpr unsigned N = 16;
|
||||
@@ -484,7 +490,7 @@ int main()
|
||||
|
||||
constexpr unsigned HPad = 1;
|
||||
constexpr unsigned WPad = 1;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// 1x1 filter, 28x28 image
|
||||
constexpr unsigned N = 16;
|
||||
constexpr unsigned C = 256;
|
||||
@@ -608,7 +614,7 @@ int main()
|
||||
nrepeat);
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
if(S == 3 && R == 3)
|
||||
{
|
||||
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
|
||||
|
||||
@@ -137,12 +137,18 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
|
||||
constexpr unsigned HoPerThread = 1;
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
|
||||
constexpr unsigned WeiBlockCopyThreadPerDim1 = 32;
|
||||
|
||||
constexpr unsigned InBlockCopyDataPerRead = 4; // not used, yet
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// for 7x7, 38x38
|
||||
constexpr unsigned NPerBlock = 8;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
constexpr unsigned CPerBlock = 2;
|
||||
constexpr unsigned CPerBlock = 1;
|
||||
constexpr unsigned HoPerBlock = 4;
|
||||
constexpr unsigned WoPerBlock = 4;
|
||||
|
||||
@@ -152,6 +158,12 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
|
||||
constexpr unsigned HoPerThread = 1;
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
|
||||
constexpr unsigned WeiBlockCopyThreadPerDim1 = 32;
|
||||
|
||||
constexpr unsigned InBlockCopyDataPerRead = 4; // not used, yet
|
||||
constexpr unsigned WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 0
|
||||
// for 3x3, 56x56
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
template <unsigned NRow_, unsigned NCol_, unsigned RowStride_>
|
||||
struct ConstantMatrixDescriptor
|
||||
{
|
||||
__host__ __device__ ConstantMatrixDescriptor()
|
||||
__host__ __device__ constexpr ConstantMatrixDescriptor()
|
||||
{
|
||||
static_assert(NCol_ <= RowStride_, "wrong! NCol > RowStride!");
|
||||
}
|
||||
|
||||
@@ -124,12 +124,12 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
|
||||
{
|
||||
if(TransA && (!TransB) && (!TransC))
|
||||
{
|
||||
constexpr auto True = Constant<bool, true>{};
|
||||
constexpr auto False = Constant<bool, false>{};
|
||||
constexpr auto True = integral_constant<bool, true>{};
|
||||
constexpr auto False = integral_constant<bool, false>{};
|
||||
|
||||
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
|
||||
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
|
||||
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
|
||||
constexpr auto a_block_mtx = BlockMatrixA{};
|
||||
constexpr auto b_block_mtx = BlockMatrixB{};
|
||||
constexpr auto c_thread_mtx = ThreadMatrixC{};
|
||||
|
||||
constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed
|
||||
|
||||
@@ -137,11 +137,11 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
|
||||
constexpr unsigned NPerThread = c_thread_mtx.NCol();
|
||||
|
||||
// a is transposed, b is not
|
||||
const auto a_thread_mtx = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto a_thread_mtx =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
|
||||
|
||||
const auto b_thread_mtx = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto b_thread_mtx =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
|
||||
|
||||
FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
|
||||
FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
|
||||
@@ -278,8 +278,8 @@ struct BlockwiseGemmBlockABlockBThreadC
|
||||
|
||||
if(TransA && (!TransB) && (!TransC))
|
||||
{
|
||||
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
|
||||
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
|
||||
constexpr auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
|
||||
constexpr auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
|
||||
|
||||
static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
|
||||
"wrong! k dimension not consistent!");
|
||||
@@ -287,7 +287,7 @@ struct BlockwiseGemmBlockABlockBThreadC
|
||||
constexpr unsigned MPerBlock = a_block_mtx.NCol();
|
||||
constexpr unsigned NPerBlock = b_block_mtx.NCol();
|
||||
|
||||
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
|
||||
constexpr auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
|
||||
|
||||
// divide thread work
|
||||
constexpr unsigned MPerThread = c_thread_mtx.NRow();
|
||||
@@ -374,12 +374,12 @@ struct BlockwiseGemmBlockABlockBThreadC
|
||||
{
|
||||
if(TransA && (!TransB) && (!TransC))
|
||||
{
|
||||
constexpr auto True = Constant<bool, true>{};
|
||||
constexpr auto False = Constant<bool, false>{};
|
||||
constexpr auto True = integral_constant<bool, true>{};
|
||||
constexpr auto False = integral_constant<bool, false>{};
|
||||
|
||||
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
|
||||
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
|
||||
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
|
||||
constexpr auto a_block_mtx = BlockMatrixA{};
|
||||
constexpr auto b_block_mtx = BlockMatrixB{};
|
||||
constexpr auto c_thread_mtx = ThreadMatrixC{};
|
||||
|
||||
constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed
|
||||
|
||||
@@ -387,11 +387,11 @@ struct BlockwiseGemmBlockABlockBThreadC
|
||||
constexpr unsigned NPerThread = c_thread_mtx.NCol();
|
||||
|
||||
// a is transposed, b is not
|
||||
const auto a_thread_mtx = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto a_thread_mtx =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
|
||||
|
||||
const auto b_thread_mtx = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto b_thread_mtx =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
|
||||
|
||||
FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
|
||||
FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
|
||||
@@ -556,8 +556,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
|
||||
FloatC* p_c_thread,
|
||||
Accumulator f_accum) const
|
||||
{
|
||||
constexpr auto True = Constant<bool, true>{};
|
||||
constexpr auto False = Constant<bool, false>{};
|
||||
constexpr auto True = integral_constant<bool, true>{};
|
||||
constexpr auto False = integral_constant<bool, false>{};
|
||||
|
||||
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
|
||||
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
|
||||
@@ -648,12 +648,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
|
||||
FloatC* p_c_thread,
|
||||
Accumulator f_accum) const
|
||||
{
|
||||
constexpr auto True = Constant<bool, true>{};
|
||||
constexpr auto False = Constant<bool, false>{};
|
||||
constexpr auto True = integral_constant<bool, true>{};
|
||||
constexpr auto False = integral_constant<bool, false>{};
|
||||
|
||||
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
|
||||
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
|
||||
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
|
||||
constexpr auto a_block_mtx = BlockMatrixA{};
|
||||
constexpr auto b_block_mtx = BlockMatrixB{};
|
||||
constexpr auto c_thread_mtx = ThreadMatrixC{};
|
||||
|
||||
constexpr unsigned M = a_block_mtx.NCol();
|
||||
constexpr unsigned N = b_block_mtx.NCol();
|
||||
@@ -663,22 +663,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
|
||||
constexpr unsigned NPerThread = c_thread_mtx.NCol();
|
||||
|
||||
// thread A, B for GEMM
|
||||
const auto a_thread_mtx = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto a_thread_mtx =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
|
||||
|
||||
const auto b_thread_mtx = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto b_thread_mtx =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
|
||||
|
||||
// thread A-sub, B-sub for copy
|
||||
const auto a_thread_sub_mtx =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{},
|
||||
Number<MPerThreadSubC>{},
|
||||
Number<MPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
|
||||
|
||||
const auto b_thread_sub_mtx =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{},
|
||||
Number<NPerThreadSubC>{},
|
||||
Number<NPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
|
||||
|
||||
// register
|
||||
FloatA p_a_thread_0[a_thread_mtx.GetElementSpace()];
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
#define WARPSIZE 32;
|
||||
__device__ unsigned get_thread_local_1d_id() { return threadIdx.x; }
|
||||
|
||||
__device__ unsigned get_block_1d_id() { return blockIdx.x; }
|
||||
|
||||
template <class T1, class T2>
|
||||
struct is_same
|
||||
@@ -14,20 +16,16 @@ struct is_same<T, T>
|
||||
static const bool value = true;
|
||||
};
|
||||
|
||||
__device__ unsigned get_thread_local_1d_id() { return threadIdx.x; }
|
||||
|
||||
__device__ unsigned get_block_1d_id() { return blockIdx.x; }
|
||||
|
||||
template <class T, T N>
|
||||
struct Constant
|
||||
struct integral_constant
|
||||
{
|
||||
static const T mValue = N;
|
||||
static const T value = N;
|
||||
|
||||
__host__ __device__ constexpr T Get() const { return mValue; }
|
||||
__host__ __device__ constexpr T Get() const { return value; }
|
||||
};
|
||||
|
||||
template <unsigned N>
|
||||
using Number = Constant<unsigned, N>;
|
||||
using Number = integral_constant<unsigned, N>;
|
||||
|
||||
template <unsigned... Is>
|
||||
struct Sequence
|
||||
@@ -64,4 +62,4 @@ struct Sequence
|
||||
printf("Sequence::ReorderByPutOldToNew not implemented");
|
||||
assert(false);
|
||||
}
|
||||
};
|
||||
};
|
||||
@@ -156,18 +156,16 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
|
||||
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
|
||||
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
|
||||
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
|
||||
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<KPerBlock>{},
|
||||
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<WoPerBlock * NPerBlock>{},
|
||||
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto b_cxwn_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
|
||||
Number<WoPerBlock * NPerBlock>{},
|
||||
Number<in_chwn_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto c_kxwn_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
|
||||
|
||||
const auto blockwise_batch_gemm =
|
||||
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
|
||||
|
||||
@@ -176,18 +176,16 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
|
||||
// A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
|
||||
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
|
||||
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
|
||||
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<KPerBlock>{},
|
||||
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<WoPerBlock * NPerBlock>{},
|
||||
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto b_cxwn_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
|
||||
Number<WoPerBlock * NPerBlock>{},
|
||||
Number<in_chwn_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto c_kxwn_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
|
||||
|
||||
const auto blockwise_batch_gemm =
|
||||
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
|
||||
|
||||
@@ -176,18 +176,16 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
|
||||
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
|
||||
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
|
||||
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
|
||||
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<KPerBlock>{},
|
||||
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<WoPerBlock * NPerBlock>{},
|
||||
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto b_cxwn_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
|
||||
Number<WoPerBlock * NPerBlock>{},
|
||||
Number<in_chwn_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto c_kxwn_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
|
||||
|
||||
const auto blockwise_batch_gemm =
|
||||
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
|
||||
|
||||
@@ -115,16 +115,16 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
|
||||
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
|
||||
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
|
||||
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
|
||||
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile
|
||||
constexpr auto a_cxk_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
|
||||
|
||||
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<WoPerBlock * NPerBlock>{},
|
||||
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto b_cxwn_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
|
||||
Number<WoPerBlock * NPerBlock>{},
|
||||
Number<in_chwn_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto c_kxwn_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
|
||||
|
||||
const auto blockwise_batch_gemm =
|
||||
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
|
||||
|
||||
@@ -121,16 +121,16 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
|
||||
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
|
||||
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
|
||||
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
|
||||
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile
|
||||
constexpr auto a_cxk_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
|
||||
|
||||
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<WoPerBlock * NPerBlock>{},
|
||||
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto b_cxwn_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
|
||||
Number<WoPerBlock * NPerBlock>{},
|
||||
Number<in_chwn_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto c_kxwn_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
|
||||
|
||||
const auto blockwise_batch_gemm =
|
||||
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
|
||||
|
||||
@@ -172,18 +172,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
|
||||
// a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
|
||||
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
|
||||
// c_mtx[K,B] is out_block[K,B]
|
||||
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<KPerBlock>{},
|
||||
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<BPerBlock>{},
|
||||
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto c_kxb_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
|
||||
|
||||
#if 0
|
||||
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
|
||||
@@ -258,15 +254,15 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
|
||||
{
|
||||
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_gemm.Run
|
||||
#else
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#endif
|
||||
(p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
|
||||
p_in_block + s * Wi + r,
|
||||
p_out_thread,
|
||||
f_accum);
|
||||
(p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
|
||||
p_in_block + s * Wi + r,
|
||||
p_out_thread,
|
||||
f_accum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -172,18 +172,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
|
||||
// a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
|
||||
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
|
||||
// c_mtx[K,B] is out_block[K,B]
|
||||
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<KPerBlock>{},
|
||||
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<BPerBlock>{},
|
||||
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto c_kxb_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
|
||||
|
||||
#if 0
|
||||
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
|
||||
|
||||
@@ -140,16 +140,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
|
||||
// a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
|
||||
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
|
||||
// c_mtx[K,B] is out_block[K,B]
|
||||
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile
|
||||
constexpr auto a_cxk_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
|
||||
|
||||
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<BPerBlock>{},
|
||||
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto c_kxb_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
|
||||
|
||||
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
|
||||
decltype(a_cxk_block_mtx_desc),
|
||||
|
||||
@@ -156,16 +156,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
|
||||
// a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
|
||||
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
|
||||
// c_mtx[K,B] is out_block[K,B]
|
||||
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile
|
||||
constexpr auto a_cxk_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
|
||||
|
||||
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{},
|
||||
Number<BPerBlock>{},
|
||||
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
|
||||
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
|
||||
|
||||
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile
|
||||
constexpr auto c_kxb_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
|
||||
|
||||
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
|
||||
decltype(a_cxk_block_mtx_desc),
|
||||
|
||||
@@ -30,13 +30,13 @@ template <class MatrixA,
|
||||
class FloatC,
|
||||
class Accumulator>
|
||||
__device__ void threadwise_gemm(MatrixA,
|
||||
Constant<bool, TransA>,
|
||||
integral_constant<bool, TransA>,
|
||||
FloatA* const p_a_thread,
|
||||
MatrixB,
|
||||
Constant<bool, TransB>,
|
||||
integral_constant<bool, TransB>,
|
||||
FloatB* const p_b_thread,
|
||||
MatrixC,
|
||||
Constant<bool, TransC>,
|
||||
integral_constant<bool, TransC>,
|
||||
FloatC* p_c_thread,
|
||||
Accumulator f_accum)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user