support max3 in smoothquant and add+ rmsnorm + rdquant (#1654)

* Fix cmake example build * Support max3 in smoothquant one pass * support max3 in two pass * support max3 in add_rmsnorm_rdquant
2026-04-20 06:49:15 +00:00 · 2024-11-27 05:01:15 +08:00
parent bfe983a151
commit abae2afc72
5 changed files with 94 additions and 19 deletions
--- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_one_pass.hpp
@@ -25,6 +25,7 @@ struct SmoothquantPipelineOnePass
    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
    static constexpr bool kPadM              = false; // TODO - BlockSmoothquantProblem::kPadM
    static constexpr bool kPadN              = Problem::kPadN;
+    static constexpr bool UseMax3            = true; // TODO - Move to trait

    static constexpr const char* name = []() {
        if constexpr(kNeedCrossWarpSync)
@@ -52,7 +53,15 @@ struct SmoothquantPipelineOnePass
            xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());

        auto reduce_absmax_func  = ReduceOp::AbsMax{};
-        auto reduce_max_func     = ReduceOp::Max{};
+        auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
+        auto reduce_max_func = ReduceOp::Max{};
+
        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
        auto block_reduce2d_cross_warp_sync =
@@ -68,8 +77,23 @@ struct SmoothquantPipelineOnePass
            xscale);

        // compute absmax, cross-lane->cross-warp
-        auto absmax = block_reduce2d(
-            y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+        auto absmax = [&]() {
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+            {
+                return block_reduce2d(y,
+                                      reduce_absmax_func.GetIdentityValue<ComputeDataType>(),
+                                      reduce_absmax3_func,
+                                      sequence<1, 2>{});
+            }
+            else
+            {
+                return block_reduce2d(
+                    y, reduce_absmax_func.GetIdentityValue<ComputeDataType>(), reduce_absmax_func);
+            }
+        }();
        block_reduce2d_sync(absmax, reduce_max_func);
        block_reduce2d_cross_warp_sync(absmax, smem, reduce_max_func);

--- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
@@ -25,6 +25,7 @@ struct SmoothquantPipelineTwoPass
    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
    static constexpr bool kPadM              = false; // TODO - BlockSmoothquantProblem::kPadM
    static constexpr bool kPadN              = Problem::kPadN;
+    static constexpr bool UseMax3            = true; // TODO - Move to trait

    static constexpr const char* name = []() {
        if constexpr(kNeedCrossWarpSync)
@@ -56,6 +57,13 @@ struct SmoothquantPipelineTwoPass
            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));

        auto reduce_absmax_func  = ReduceOp::AbsMax{};
+        auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
+            float rtn;
+            asm volatile("v_max3_f32 %0, %1, abs(%2), abs(%3)"
+                         : "=v"(rtn)
+                         : "v"(acc_), "v"(v_0_), "v"(v_1_));
+            return rtn;
+        };
        auto reduce_max_func     = ReduceOp::Max{};
        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
@@ -77,7 +85,13 @@ struct SmoothquantPipelineTwoPass
                x,
                xscale);

-            block_reduce2d(y, absmax, reduce_absmax_func);
+            constexpr auto x_size_per_row =
+                x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
+            if constexpr(UseMax3 && std::is_same_v<ComputeDataType, float> &&
+                         x_size_per_row % 2 == 0)
+                block_reduce2d(y, absmax, reduce_absmax3_func, sequence<1, 2>{});
+            else
+                block_reduce2d(y, absmax, reduce_absmax_func);

            move_tile_window(x_window, {0, Block_N});
            move_tile_window(xscale_window, {Block_N});