mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-04 21:51:28 +00:00
Enable Async Copy for MI355 (#2425)
* add support for the async load builtin * add async load api * fix some compile errors * fix a compile error * fix some compile errors * add a pipeline which copies from v4 * add a new pipeline for async load * fix some compile errors * add async load tests * fix some issues in async load * fix * fix async inline assembly * fix async inline assembly * add ignore header file * comment out some non-gfx950 code * comment out some non-gfx950 code * fix an error * update async load apis * fix lds descriptor * fix a compile error * fix some compile errors * fix a descriptor issue * update lds descriptor * change async pipeline's tile distribution pattern from thread to warp * fix clang format * update async policy * fix a CRTP issue * fix a typo * change lds layout * fix some sync issues * improve code * delete the async test * fix a comment formatting issue * avoid compiling device functions when compiling for host * make gemm run * add the copy kernel support * finish the feature * Address comment * add the support for buffer_builtin * solve the merge problem * Comment Addressed --------- Co-authored-by: joye <joye@amd.com> Co-authored-by: joyeamd <John.Ye@amd.com>
This commit is contained in:
@@ -50,11 +50,12 @@ struct TileCopyShape
|
||||
static_assert(WaveGroupSize == WarpPerBlock_M * WarpPerBlock_N, "Inconsisten wave group size!");
|
||||
};
|
||||
|
||||
template <typename XDataType_, typename BlockShape_>
|
||||
template <typename XDataType_, typename BlockShape_, bool AsyncCopy_>
|
||||
struct TileCopyProblem
|
||||
{
|
||||
using XDataType = remove_cvref_t<XDataType_>;
|
||||
using BlockShape = remove_cvref_t<BlockShape_>;
|
||||
using XDataType = remove_cvref_t<XDataType_>;
|
||||
using BlockShape = remove_cvref_t<BlockShape_>;
|
||||
static constexpr bool AsyncCopy = AsyncCopy_;
|
||||
};
|
||||
|
||||
template <typename Problem_>
|
||||
@@ -63,6 +64,8 @@ struct TileCopy
|
||||
using Problem = ck_tile::remove_cvref_t<Problem_>;
|
||||
using XDataType = typename Problem::XDataType;
|
||||
|
||||
static constexpr bool AsyncCopy = Problem::AsyncCopy;
|
||||
|
||||
template <typename Problem>
|
||||
CK_TILE_DEVICE static constexpr auto MakeDRAMDistribution()
|
||||
{
|
||||
@@ -156,17 +159,29 @@ struct TileCopy
|
||||
|
||||
if(my_id == warp_id)
|
||||
{
|
||||
// load from DRAM to registers
|
||||
load_tile(dram_tile, x_block_window);
|
||||
if constexpr(AsyncCopy)
|
||||
{
|
||||
async_load_tile(x_block_lds_window_no_dist, x_block_window);
|
||||
|
||||
// store in lds
|
||||
store_tile(x_block_lds_window_no_dist, dram_tile);
|
||||
load_tile(dram_tile, x_block_lds_window);
|
||||
|
||||
// read from lds to registers
|
||||
load_tile(dram_tile, x_block_lds_window);
|
||||
// store from registers to DRAM
|
||||
store_tile(y_block_window, dram_tile);
|
||||
}
|
||||
else
|
||||
{
|
||||
// load from DRAM to registers
|
||||
load_tile(dram_tile, x_block_window);
|
||||
|
||||
// store from registers to DRAM
|
||||
store_tile(y_block_window, dram_tile);
|
||||
// store in lds
|
||||
store_tile(x_block_lds_window_no_dist, dram_tile);
|
||||
|
||||
// read from lds to registers
|
||||
load_tile(dram_tile, x_block_lds_window);
|
||||
|
||||
// store from registers to DRAM
|
||||
store_tile(y_block_window, dram_tile);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
move_tile_window(x_block_window, {0, S::Block_N});
|
||||
|
||||
Reference in New Issue
Block a user