diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp index 9770e99738..5572de2657 100644 --- a/include/ck_tile/host/kernel_launch.hpp +++ b/include/ck_tile/host/kernel_launch.hpp @@ -24,7 +24,11 @@ __launch_bounds__(MaxThreadPerBlock, MinBlockPerCu) #endif __global__ void kentry(Args... args) { +#if defined(__HIP_DEVICE_COMPILE__) Kernel{}(args...); +# else + (..., (ignore = args)); +#endif } // diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp index 648b2b85bd..71e33f0a75 100644 --- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -156,7 +156,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1 template CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, - const AElementFunction& a_element_func, + [[maybe_unused]] const AElementFunction& a_element_func, const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp, index_t num_loop, void* p_smem) const @@ -249,7 +249,8 @@ struct FlatmmPipelineAGmemBGmemCRegV1 // prefetch // global read 0 - auto a_block_tile = load_tile(a_copy_dram_window); + //auto a_block_tile = load_tile(a_copy_dram_window); + // TODO: gino: async statically_indexed_array< statically_indexed_array, @@ -260,6 +261,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1 statically_indexed_array, NIterPerWarp> b_warp_tensor; + // gino: vgpr? statically_indexed_array< statically_indexed_array, @@ -274,6 +276,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1 {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter}); b_warp_tensor(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter)); + // gino: vgpr, so can't async? }); }); @@ -288,18 +291,25 @@ struct FlatmmPipelineAGmemBGmemCRegV1 tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); // LDS write 0 + // TODO: gino: Does command support shuffle and async load? if constexpr(std::is_same_v) { - auto a_shuffle_tmp = make_static_distributed_tensor( - PipelinePolicy::template MakeShuffledARegBlockDistribution()); - shuffle_tile(a_shuffle_tmp, a_block_tile); - const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_shuffle_tmp); - store_tile(a_copy_lds_window, a_block_tile_tmp); + assert(false); // not support + // auto a_shuffle_tmp = make_static_distributed_tensor( + // PipelinePolicy::template MakeShuffledARegBlockDistribution()); + // shuffle_tile(a_shuffle_tmp, a_block_tile); + // const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_shuffle_tmp); + // store_tile(a_copy_lds_window, a_block_tile_tmp); } else { - store_tile(a_copy_lds_window, tile_elementwise_in(a_element_func, a_block_tile)); + //store_tile(a_copy_lds_window, tile_elementwise_in(a_element_func, a_block_tile)); + + async_load_tile(a_copy_lds_window, a_copy_dram_window); + __builtin_amdgcn_s_waitcnt(3952); + //block_sync_lds(); } + // TODO: gino: use this to sync async load. block_sync_lds(); } @@ -307,7 +317,8 @@ struct FlatmmPipelineAGmemBGmemCRegV1 while(iCounter > 0) { // global read i + 1 - a_block_tile = load_tile(a_copy_dram_window); + // TODO: gino: async + //a_block_tile = load_tile(a_copy_dram_window); // GEMM i block_flatmm(c_block_tile, a_warp_windows, b_warp_tensor); @@ -324,6 +335,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1 b_warp_tensor_2(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter)); }); }); + // gino: to vgpr // move to i + 2 move_tile_window(a_copy_dram_window, {0, kKPerBlock}); @@ -332,15 +344,19 @@ struct FlatmmPipelineAGmemBGmemCRegV1 move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock}); // LDS write i + 1 - auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); - store_tile(a_copy_lds_window, a_block_tile_tmp); +// auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); +// store_tile(a_copy_lds_window, a_block_tile_tmp); + // TODO: gino: combine with async? + async_load_tile(a_copy_lds_window, a_copy_dram_window); + __builtin_amdgcn_s_waitcnt(3952); HotLoopScheduler(); block_sync_lds(); // iCounter--; // global read i + 1 - a_block_tile = load_tile(a_copy_dram_window); + //a_block_tile = load_tile(a_copy_dram_window); + // TODO: gino: async // GEMM i block_flatmm(c_block_tile, a_warp_windows, b_warp_tensor_2); @@ -355,6 +371,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1 {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter}); b_warp_tensor(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter)); + // TODO: gino: to vgpr, can't async }); }); @@ -365,9 +382,12 @@ struct FlatmmPipelineAGmemBGmemCRegV1 move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock}); // LDS write i + 1 - a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); - store_tile(a_copy_lds_window, a_block_tile_tmp); + //a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); + //store_tile(a_copy_lds_window, a_block_tile_tmp); + // gino: combine with async + async_load_tile(a_copy_lds_window, a_copy_dram_window); + __builtin_amdgcn_s_waitcnt(3952); HotLoopScheduler(); block_sync_lds(); @@ -377,7 +397,8 @@ struct FlatmmPipelineAGmemBGmemCRegV1 // tail { // global read i + 1 - a_block_tile = load_tile(a_copy_dram_window); + //a_block_tile = load_tile(a_copy_dram_window); + // TODO: gino: async // GEMM i block_flatmm(c_block_tile, a_warp_windows, b_warp_tensor); @@ -392,6 +413,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1 {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter}); b_warp_tensor_2(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter)); + // gino: to vgpr, can't async }); }); @@ -399,12 +421,15 @@ struct FlatmmPipelineAGmemBGmemCRegV1 // move_tile_window(a_copy_dram_window, {0, kKPerBlock}); // LDS write i + 1 - const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); - store_tile(a_copy_lds_window, a_block_tile_tmp); + //const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); + //store_tile(a_copy_lds_window, a_block_tile_tmp); + // TODO: gino: combine with async // move to next flat K // move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock}); + async_load_tile(a_copy_lds_window, a_copy_dram_window); + __builtin_amdgcn_s_waitcnt(3952); HotLoopScheduler(); block_sync_lds();