mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
changed to dynamics lds allocation
This commit is contained in:
@@ -220,7 +220,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr index_t BlockSize = 128;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// 1x1, 14x14, Vega 20, hack CPerBlock = 1
|
||||
constexpr index_t BPerBlock = 64;
|
||||
constexpr index_t KPerBlock = 128;
|
||||
@@ -306,6 +306,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
float time = launch_kernel(gridwise_conv.Run,
|
||||
dim3(GridSize),
|
||||
dim3(BlockSize),
|
||||
gridwise_conv.GetSharedMemoryUsage(),
|
||||
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
|
||||
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
|
||||
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
|
||||
|
||||
@@ -29,14 +29,14 @@ struct KernelTimer
|
||||
};
|
||||
|
||||
template <typename... Args, typename F>
|
||||
float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args)
|
||||
float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
|
||||
{
|
||||
KernelTimer timer;
|
||||
|
||||
#if DEVICE_BACKEND_HIP
|
||||
timer.Start();
|
||||
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, 0, 0, args...);
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...);
|
||||
|
||||
timer.End();
|
||||
|
||||
@@ -47,7 +47,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args)
|
||||
|
||||
timer.Start();
|
||||
|
||||
cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, 0, 0);
|
||||
cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, lds_byte, 0);
|
||||
|
||||
timer.End();
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ template <index_t GridSize,
|
||||
class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
|
||||
{
|
||||
public:
|
||||
__host__ __device__ static index_t GetSharedMemorySize()
|
||||
__host__ __device__ constexpr index_t GetInputBlockElementSpace() const
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
@@ -46,7 +46,6 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
|
||||
|
||||
constexpr auto in_chwn_global_desc = InGlobalDesc{};
|
||||
constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
|
||||
constexpr auto out_khwn_global_desc = OutGlobalDesc{};
|
||||
|
||||
constexpr index_t Hi = in_chwn_global_desc.GetLength(I1);
|
||||
constexpr index_t Wi = in_chwn_global_desc.GetLength(I2);
|
||||
@@ -56,29 +55,59 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
|
||||
|
||||
constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1);
|
||||
|
||||
// tensor view of blockwise input and weight
|
||||
// tensor view of blockwise input
|
||||
// be careful of alignment
|
||||
constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});
|
||||
|
||||
constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
|
||||
|
||||
// tensor view of threadwise output in register
|
||||
constexpr auto out_kb_thread_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
|
||||
|
||||
// LDS: be careful of alignment
|
||||
constexpr index_t max_align =
|
||||
mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
|
||||
|
||||
return in_cb_block_desc.GetElementSpace(Number<max_align>{});
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr index_t GetWeightBlockElementSpace() const
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto in_chwn_global_desc = InGlobalDesc{};
|
||||
constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
|
||||
|
||||
constexpr index_t Hi = in_chwn_global_desc.GetLength(I1);
|
||||
constexpr index_t Wi = in_chwn_global_desc.GetLength(I2);
|
||||
|
||||
constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1);
|
||||
constexpr index_t X = wei_cyxk_global_desc.GetLength(I2);
|
||||
|
||||
constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1);
|
||||
|
||||
// tensor view of blockwise weight
|
||||
// be careful of alignment
|
||||
constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
|
||||
|
||||
// LDS: be careful of alignment
|
||||
constexpr index_t in_block_element_space =
|
||||
in_cb_block_desc.GetElementSpace(Number<max_align>{});
|
||||
constexpr index_t max_align =
|
||||
mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
|
||||
|
||||
constexpr index_t wei_block_element_space =
|
||||
wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
|
||||
return wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
|
||||
}
|
||||
|
||||
return (in_block_element_space + wei_block_element_space) * sizeof(Float);
|
||||
__host__ __device__ constexpr index_t GetSharedMemoryUsage() const
|
||||
{
|
||||
|
||||
return (GetInputBlockElementSpace() + GetWeightBlockElementSpace()) * sizeof(Float);
|
||||
}
|
||||
|
||||
__device__ constexpr static Float* GetSharedMemoryBegin()
|
||||
{
|
||||
extern __shared__ Float s[];
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
__global__ static void Run(const Float* const __restrict__ p_in_global,
|
||||
@@ -251,8 +280,8 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
|
||||
constexpr index_t wei_block_element_space =
|
||||
wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
|
||||
|
||||
__shared__ Float p_in_block[in_block_element_space];
|
||||
__shared__ Float p_wei_block[wei_block_element_space];
|
||||
Float* const p_in_block = GetSharedMemoryBegin();
|
||||
Float* const p_wei_block = GetSharedMemoryBegin() + in_block_element_space;
|
||||
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
|
||||
@@ -288,7 +317,7 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
|
||||
blockwise_gemm.Run
|
||||
#elif 1
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 0
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
|
||||
Reference in New Issue
Block a user