Add gridwise GEMM pipeline (#89)

* clean up * add mutilple thread scratch to ThreadwiseTensorSliceTransfer_v3r1 * add 2 stage prefetch * add more sanity check into transform_tensor_descriptor * tweak * enabling 2 stage prefetch to exsiting gridwise gemm; tweak * enabling 2 stage prefetch to exsiting gridwise gemm * move gridwise gemm pipeline in class; clean up * add some irregular tile size * update CalculateHasMainK0BlockLoop for multi-stage-prefetch * refactor gridwise gemm pipeline class
2026-05-13 01:36:06 +00:00 · 2022-02-23 17:23:49 -06:00
parent 756a761727
commit 22d438ae9e
18 changed files with 864 additions and 1694 deletions
--- a/profiler/include/profile_gemm_impl.hpp
+++ b/profiler/include/profile_gemm_impl.hpp
@@ -1,4 +1,5 @@
 #pragma once
+#include <iomanip>
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
@@ -30,6 +31,9 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(std::vector<De
 void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
 void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);

+void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
+    std::vector<DeviceGemmNoOpPtr>&);
+
 void add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
 void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
 void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
@@ -225,6 +229,9 @@ void profile_gemm_impl(int do_verification,

            ck::tensor_operation::device::device_gemm_instance::
                add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs);
+
+            ck::tensor_operation::device::device_gemm_instance::
+                add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
@@ -293,8 +300,8 @@ void profile_gemm_impl(int do_verification,

            float gb_per_sec = num_btype / 1.E6 / ave_time;

-            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                      << " GB/s, " << gemm_name << std::endl;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << gemm_name << std::endl;

            if(tflops > best_tflops)
            {