From 2b840f5a85528deb9c2fa64d1faca73004b4b4bc Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 18 Nov 2024 07:32:30 +0000 Subject: [PATCH] reduce prefetch stage in blockwisepipev4 --- .../blockwise_gemm_pipeline_xdlops_v4.hpp | 113 ++++++++---------- 1 file changed, 47 insertions(+), 66 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp index bd5a1bedf5..71bd972293 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp @@ -8,7 +8,7 @@ namespace ck { // Compute optimimal pipeline with highest resource request -// GlobalPrefetchStages: 4 +// GlobalPrefetchStages: 3 // LocalPreFillStages: 2 // LocalPreFetchStages: 1 // LocalSharedMemoryBuffer: 2 @@ -142,9 +142,9 @@ struct BlockwiseGemmXdlops_pipeline_v4 - __device__ static constexpr void HotLoopScheduler(ScheduleGroup schedule_group) + __device__ static constexpr void HotLoopScheduler() { // TODO: Take data type into consideration as pipe ver 3 // A-B splited schedule @@ -195,42 +194,42 @@ struct BlockwiseGemmXdlops_pipeline_v4{}([&](auto idsread) { ignore = idsread; - __builtin_amdgcn_sched_group_barrier(0x100, 1, schedule_group); // DS read - __builtin_amdgcn_sched_group_barrier(0x008, 1, schedule_group); // MFMA + __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA }); static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) { ignore = idswrite; - __builtin_amdgcn_sched_group_barrier(0x200, 1, schedule_group); // DS write - __builtin_amdgcn_sched_group_barrier(0x008, 1, schedule_group); // MFMA + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA }); - __builtin_amdgcn_sched_group_barrier(0x020, 1, schedule_group); // VMEM read + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read __builtin_amdgcn_sched_group_barrier(0x008, num_mfma_per_issue - num_dsread_per_issue_a - num_dswrite_per_issue_a, - schedule_group); // MFMA + 0); // MFMA }); static_for<0, num_issue_b, 1>{}([&](auto i) { ignore = i; static_for<0, num_dsread_per_issue_b, 1>{}([&](auto idsread) { ignore = idsread; - __builtin_amdgcn_sched_group_barrier(0x100, 1, schedule_group); // DS read - __builtin_amdgcn_sched_group_barrier(0x008, 1, schedule_group); // MFMA + __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA }); static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) { ignore = idswrite; - __builtin_amdgcn_sched_group_barrier(0x200, 1, schedule_group); // DS write - __builtin_amdgcn_sched_group_barrier(0x008, 1, schedule_group); // MFMA + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA }); - __builtin_amdgcn_sched_group_barrier(0x020, 1, schedule_group); // VMEM read + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read __builtin_amdgcn_sched_group_barrier(0x008, num_mfma_per_issue - num_dsread_per_issue_a - num_dswrite_per_issue_b, - schedule_group); // MFMA + 0); // MFMA }); __builtin_amdgcn_sched_barrier(0); } @@ -274,26 +273,15 @@ struct BlockwiseGemmXdlops_pipeline_v4{}> b_thread_bufs; // Global prefetch 1 - a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0); - b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0); - - a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); - b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); - - // Global prefetch 2 - a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1); - b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1); + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); // Local prefill 1 - a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0), I0); - b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I0), I0); - - // Local prefill 2 - a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1), I1); - b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I1), I1); + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0)); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I0)); // Local prefetch 1 block_sync_lds(); @@ -316,16 +304,20 @@ struct BlockwiseGemmXdlops_pipeline_v4{}([&](auto k) { @@ -368,13 +358,11 @@ struct BlockwiseGemmXdlops_pipeline_v4{}([&](auto k) { @@ -448,8 +434,8 @@ struct BlockwiseGemmXdlops_pipeline_v4{}([&](auto k0) { static_for<0, MRepeat, 1>{}([&](auto m0) { @@ -479,13 +465,10 @@ struct BlockwiseGemmXdlops_pipeline_v4{}([&](auto k) { @@ -535,7 +518,7 @@ struct BlockwiseGemmXdlops_pipeline_v4