Improve the general performance of Preshuffled GEMM V3 and delete unnecessary instances (#2166)

* Made the work compile * Fixed the example code, but the profiler error still remains * Finished the feature * Applied clang-format and updated the CHANGELOG * Solved the preshuffle v1 & v2 problem * Addressed review comments * Addressed review comments
2026-05-11 17:00:18 +00:00 · 2025-05-12 09:52:58 -07:00
parent 9d1e44e56a
commit b49f7de81f
11 changed files with 445 additions and 918 deletions
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_wp.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_wp.hpp
@@ -18,173 +18,6 @@ namespace device {
 namespace instance {

 #if(defined(CK_ENABLE_F16) || defined(CK_ENABLE_FP8))
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p1(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     F16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);

 void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p1(
    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
@@ -268,174 +101,6 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16
 #endif

 #if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8))
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances_v2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p1(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p2(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                                     Col,
-                                                                     Tuple<Row, Col>,
-                                                                     Row,
-                                                                     F8,
-                                                                     F8,
-                                                                     Tuple<F32, F32>,
-                                                                     BF16,
-                                                                     PassThrough,
-                                                                     PassThrough,
-                                                                     MultiplyMultiply>>>&
-        instances);
-
 void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p1(
    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
                                                                     Col,
@@ -562,33 +227,6 @@ struct DeviceOperationInstanceFactory<
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                         is_same_v<CLayout, Row>)
            {
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances(
-                    op_ptrs);
-
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances_v2(
-                    op_ptrs);
-
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p1(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p2(
-                    op_ptrs);
-
                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p1(
                    op_ptrs);
                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p2(
@@ -612,33 +250,6 @@ struct DeviceOperationInstanceFactory<
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                         is_same_v<CLayout, Row>)
            {
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances(
-                    op_ptrs);
-
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances_v2(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances_v2(
-                    op_ptrs);
-
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p1(
-                    op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p2(
-                    op_ptrs);
-
                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p1(
                    op_ptrs);
                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p2(