mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 08:50:17 +00:00
working on reducing index calculation...
This commit is contained in:
@@ -396,6 +396,9 @@ int main()
|
||||
constexpr unsigned K = 64;
|
||||
constexpr unsigned S = 3;
|
||||
constexpr unsigned R = 3;
|
||||
|
||||
constexpr unsigned HPad = 0;
|
||||
constexpr unsigned WPad = 0;
|
||||
#elif 0
|
||||
// 3x3, 56x56
|
||||
constexpr unsigned N = 64;
|
||||
@@ -586,7 +589,7 @@ int main()
|
||||
#endif
|
||||
(in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
|
||||
|
||||
#elif 1
|
||||
#elif 0
|
||||
device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(in_nchw_desc,
|
||||
in_nchw,
|
||||
wei_kcsr_desc,
|
||||
@@ -598,7 +601,7 @@ int main()
|
||||
nrepeat);
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
if(S == 3 && R == 3)
|
||||
{
|
||||
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
|
||||
|
||||
@@ -87,7 +87,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
constexpr unsigned BlockSize = 8;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256
|
||||
constexpr unsigned NPerBlock = 16;
|
||||
constexpr unsigned KPerBlock = 64;
|
||||
@@ -162,7 +162,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
|
||||
constexpr unsigned WoPerThread = 1;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// for 1x1, 28x28
|
||||
constexpr unsigned NPerBlock = 16;
|
||||
constexpr unsigned KPerBlock = 128;
|
||||
|
||||
@@ -1,76 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
template <class Float, class SrcMatrix, class DstMatrix, unsigned NRow, unsigned NCol>
|
||||
__device__ void
|
||||
threadwise_matrix_copy(SrcMatrix, Float* const p_src, DstMatrix, Float* p_dst, Sequence<NRow, NCol>)
|
||||
{
|
||||
const auto src_mtx = SrcMatrix{}; // constexpr doesn't compile
|
||||
const auto dst_mtx = DstMatrix{}; // constexpr doesn't compile
|
||||
|
||||
for(unsigned i = 0; i < NRow; ++i)
|
||||
{
|
||||
for(unsigned j = 0; j < NCol; ++j)
|
||||
{
|
||||
const unsigned src_index = src_mtx.Get1dIndex(i, j);
|
||||
const unsigned dst_index = dst_mtx.Get1dIndex(i, j);
|
||||
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class MatrixA,
|
||||
class MatrixB,
|
||||
class MatrixC,
|
||||
bool TransA,
|
||||
bool TransB,
|
||||
bool TransC,
|
||||
class FloatA,
|
||||
class FloatB,
|
||||
class FloatC,
|
||||
class Accumulator>
|
||||
__device__ void threadwise_gemm(MatrixA,
|
||||
Constant<bool, TransA>,
|
||||
FloatA* const p_a_thread,
|
||||
MatrixB,
|
||||
Constant<bool, TransB>,
|
||||
FloatB* const p_b_thread,
|
||||
MatrixC,
|
||||
Constant<bool, TransC>,
|
||||
FloatC* p_c_thread,
|
||||
Accumulator f_accum)
|
||||
{
|
||||
if(TransA && (!TransB) && (!TransC))
|
||||
{
|
||||
const auto a_mtx = MatrixA{}; // constexpr doesn't compile
|
||||
const auto b_mtx = MatrixB{}; // constexpr doesn't compile
|
||||
const auto c_mtx = MatrixC{}; // constexpr doesn't compile
|
||||
|
||||
constexpr unsigned M = c_mtx.NRow();
|
||||
constexpr unsigned N = c_mtx.NCol();
|
||||
constexpr unsigned K = a_mtx.NRow(); // A is transposed
|
||||
|
||||
for(unsigned i = 0; i < M; ++i)
|
||||
{
|
||||
for(unsigned j = 0; j < N; ++j)
|
||||
{
|
||||
for(unsigned k = 0; k < K; ++k)
|
||||
{
|
||||
const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed
|
||||
const unsigned bindex = b_mtx.Get1dIndex(k, j);
|
||||
const unsigned cindex = c_mtx.Get1dIndex(i, j);
|
||||
|
||||
f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// not implemented
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
#include "threadwise_gemm.cuh"
|
||||
|
||||
template <unsigned BlockSize,
|
||||
class BlockMatrixA,
|
||||
@@ -4,7 +4,7 @@
|
||||
#include "ConstantMatrixDescriptor.cuh"
|
||||
#include "blockwise_4d_tensor_op.cuh"
|
||||
#include "threadwise_4d_tensor_op.cuh"
|
||||
#include "gemm.cuh"
|
||||
#include "blockwise_gemm.cuh"
|
||||
|
||||
template <unsigned GridSize,
|
||||
unsigned BlockSize,
|
||||
@@ -169,23 +169,26 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
|
||||
// set threadwise output tensor to 0
|
||||
threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
|
||||
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
Float* p_in_global_block_begin =
|
||||
p_in_global + in_chwn_global_desc.Get1dIndex(
|
||||
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
|
||||
|
||||
Float* p_wei_global_block_begin =
|
||||
p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_begin += CPerBlock * in_chwn_global_desc.GetStride(I0),
|
||||
p_wei_global_block_begin += CPerBlock * wei_csrk_global_desc.GetStride(I0),
|
||||
__syncthreads())
|
||||
{
|
||||
#if 1
|
||||
// input: global mem to LDS,
|
||||
blockwise_in_copy.run(p_in_global + in_chwn_global_desc.Get1dIndex(c_block_data_begin,
|
||||
hi_block_data_begin,
|
||||
wi_block_data_begin,
|
||||
n_block_data_begin),
|
||||
p_in_block);
|
||||
blockwise_in_copy.run(p_in_global_block_begin, p_in_block);
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
// weight: global mem to LDS,
|
||||
blockwise_wei_copy.run(p_wei_global + wei_csrk_global_desc.Get1dIndex(
|
||||
c_block_data_begin, 0, 0, k_block_data_begin),
|
||||
p_wei_block);
|
||||
blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block);
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#include "ConstantMatrixDescriptor.cuh"
|
||||
#include "blockwise_4d_tensor_op.cuh"
|
||||
#include "threadwise_4d_tensor_op.cuh"
|
||||
#include "gemm.cuh"
|
||||
#include "blockwise_gemm.cuh"
|
||||
|
||||
template <unsigned GridSize,
|
||||
unsigned BlockSize,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#include "ConstantMatrixDescriptor.cuh"
|
||||
#include "blockwise_4d_tensor_op.cuh"
|
||||
#include "threadwise_4d_tensor_op.cuh"
|
||||
#include "gemm.cuh"
|
||||
#include "blockwise_gemm.cuh"
|
||||
|
||||
template <unsigned GridSize,
|
||||
unsigned BlockSize,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#include "ConstantMatrixDescriptor.cuh"
|
||||
#include "blockwise_4d_tensor_op.cuh"
|
||||
#include "threadwise_4d_tensor_op.cuh"
|
||||
#include "gemm.cuh"
|
||||
#include "blockwise_gemm.cuh"
|
||||
|
||||
template <unsigned GridSize,
|
||||
unsigned BlockSize,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include "blockwise_4d_tensor_op.cuh"
|
||||
#include "blockwise_2d_tensor_op.cuh"
|
||||
#include "threadwise_2d_tensor_op.cuh"
|
||||
#include "gemm.cuh"
|
||||
#include "blockwise_gemm.cuh"
|
||||
|
||||
// define B = flatten(N, Hi, Wi)
|
||||
template <unsigned GridSize,
|
||||
@@ -128,14 +128,12 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
|
||||
|
||||
// blockwise wei copy
|
||||
// format is [S,R,CPerBlock,KPerBlock]
|
||||
#if 1
|
||||
const auto blockwise_wei_copy =
|
||||
blockwise_4d_tensor_copy_1<BlockSize,
|
||||
Float,
|
||||
decltype(wei_srck_global_desc),
|
||||
decltype(wei_srck_block_desc),
|
||||
decltype(wei_srck_block_desc.GetLengths())>{};
|
||||
#endif
|
||||
|
||||
// a series of blockwise GEMM
|
||||
// c_mtx += transpose(a_mtx) * b_mtx
|
||||
@@ -180,21 +178,25 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
|
||||
// set threadwise output tensor to 0
|
||||
threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
|
||||
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
Float* p_in_global_block_offset =
|
||||
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
|
||||
|
||||
Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_srck_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
|
||||
p_wei_global_block_offset += CPerBlock * wei_srck_global_desc.GetStride(I2),
|
||||
__syncthreads())
|
||||
{
|
||||
#if 1
|
||||
// input: global mem to LDS,
|
||||
blockwise_in_copy.run(
|
||||
p_in_global + in_cb_global_desc.Get1dIndex(c_block_data_begin, b_block_data_begin),
|
||||
p_in_block);
|
||||
blockwise_in_copy.run(p_in_global_block_offset, p_in_block);
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
// weight: global mem to LDS,
|
||||
blockwise_wei_copy.run(p_wei_global + wei_srck_global_desc.Get1dIndex(
|
||||
0, 0, c_block_data_begin, k_block_data_begin),
|
||||
p_wei_block);
|
||||
blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block);
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include "blockwise_4d_tensor_op.cuh"
|
||||
#include "blockwise_2d_tensor_op.cuh"
|
||||
#include "threadwise_2d_tensor_op.cuh"
|
||||
#include "gemm.cuh"
|
||||
#include "blockwise_gemm.cuh"
|
||||
|
||||
// define B = N*Hi*Wi
|
||||
template <unsigned GridSize,
|
||||
|
||||
73
src/include/threadwise_gemm.cuh
Normal file
73
src/include/threadwise_gemm.cuh
Normal file
@@ -0,0 +1,73 @@
|
||||
#pragma once
|
||||
|
||||
template <class Float, class SrcMatrix, class DstMatrix, unsigned NRow, unsigned NCol>
|
||||
__device__ void
|
||||
threadwise_matrix_copy(SrcMatrix, Float* const p_src, DstMatrix, Float* p_dst, Sequence<NRow, NCol>)
|
||||
{
|
||||
const auto src_mtx = SrcMatrix{}; // constexpr doesn't compile
|
||||
const auto dst_mtx = DstMatrix{}; // constexpr doesn't compile
|
||||
|
||||
for(unsigned i = 0; i < NRow; ++i)
|
||||
{
|
||||
for(unsigned j = 0; j < NCol; ++j)
|
||||
{
|
||||
const unsigned src_index = src_mtx.Get1dIndex(i, j);
|
||||
const unsigned dst_index = dst_mtx.Get1dIndex(i, j);
|
||||
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class MatrixA,
|
||||
class MatrixB,
|
||||
class MatrixC,
|
||||
bool TransA,
|
||||
bool TransB,
|
||||
bool TransC,
|
||||
class FloatA,
|
||||
class FloatB,
|
||||
class FloatC,
|
||||
class Accumulator>
|
||||
__device__ void threadwise_gemm(MatrixA,
|
||||
Constant<bool, TransA>,
|
||||
FloatA* const p_a_thread,
|
||||
MatrixB,
|
||||
Constant<bool, TransB>,
|
||||
FloatB* const p_b_thread,
|
||||
MatrixC,
|
||||
Constant<bool, TransC>,
|
||||
FloatC* p_c_thread,
|
||||
Accumulator f_accum)
|
||||
{
|
||||
if(TransA && (!TransB) && (!TransC))
|
||||
{
|
||||
const auto a_mtx = MatrixA{}; // constexpr doesn't compile
|
||||
const auto b_mtx = MatrixB{}; // constexpr doesn't compile
|
||||
const auto c_mtx = MatrixC{}; // constexpr doesn't compile
|
||||
|
||||
constexpr unsigned M = c_mtx.NRow();
|
||||
constexpr unsigned N = c_mtx.NCol();
|
||||
constexpr unsigned K = a_mtx.NRow(); // A is transposed
|
||||
|
||||
for(unsigned i = 0; i < M; ++i)
|
||||
{
|
||||
for(unsigned j = 0; j < N; ++j)
|
||||
{
|
||||
for(unsigned k = 0; k < K; ++k)
|
||||
{
|
||||
const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed
|
||||
const unsigned bindex = b_mtx.Get1dIndex(k, j);
|
||||
const unsigned cindex = c_mtx.Get1dIndex(i, j);
|
||||
|
||||
f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// not implemented
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user