embedding fuse layernorm (#405)

* add gridwise/device sparse embedding

* update code

* update code

* remove useless makefile

* code fix

* workable

* work properly

* emb add

* add more instance

* format

* remove useless code

* fix format

* fix clang-tidy

* clean

* fix a compile error

Co-authored-by: Chao Liu <chao.liu2@amd.com>
Co-authored-by: Chao Liu <lc.roy86@gmail.com>

[ROCm/composable_kernel commit: efd1d25733]
This commit is contained in:
carlushuang
2022-09-09 23:41:15 +08:00
committed by GitHub
parent a5c5857a42
commit 8d4fd1233d
7 changed files with 998 additions and 0 deletions

View File

@@ -0,0 +1,210 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_sparse_embedding3_forward_layernorm.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// Device-level operator that fuses a 3-table sparse embedding gather
//   out_row[i] = emb_a[index_a[i]] + emb_b[index_b[i]] + emb_c[index_c[i]]
// with a layernorm (gamma/beta affine) over the embedding dimension, producing
// an [IndexLength, EmbeddingDim] packed output.
//
// Template tiling parameters (per workgroup):
//   BlockSize       - threads per workgroup (= DimClusterSize * RowClusterSize)
//   DimClusterSize  - thread-cluster extent along the index ("dim") axis
//   RowClusterSize  - thread-cluster extent along the embedding ("row") axis
//   DimPerBlock     - output rows (indices) produced per workgroup
//   RowPerBlock     - embedding width handled per workgroup (must equal EmbeddingDim)
//   DimThreadSize   - registers per thread along the dim axis
//   RowVectorSize   - vector width per thread along the row axis
template <typename EmbType,
          typename IndexType,
          typename GammaDataType,
          typename BetaDataType,
          typename AccDataType,
          typename OutType,
          ck::index_t BlockSize,
          ck::index_t DimClusterSize,
          ck::index_t RowClusterSize,
          ck::index_t DimPerBlock,
          ck::index_t RowPerBlock,
          ck::index_t DimThreadSize,
          ck::index_t RowVectorSize>
struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator
{
    // Output is a packed (row-major) [index_length, rows] tensor descriptor.
    static auto MakeOutputDescriptor(const index_t index_length, const index_t rows)
    {
        return make_naive_tensor_descriptor_packed(make_tuple(index_length, rows));
    }

    struct Argument : public BaseArgument
    {
        Argument(OutType* p_out,
                 const EmbType* p_emb_a,
                 const EmbType* p_emb_b,
                 const EmbType* p_emb_c,
                 const IndexType* p_index_a,
                 const IndexType* p_index_b,
                 const IndexType* p_index_c,
                 const GammaDataType* p_gamma,
                 const BetaDataType* p_beta,
                 const ck::index_t NumRows,
                 const ck::index_t EmbeddingDim,
                 const ck::index_t IndexLength,
                 const AccDataType epsilon)
            : p_out_(p_out),
              p_emb_a_(p_emb_a),
              p_emb_b_(p_emb_b),
              p_emb_c_(p_emb_c),
              p_index_a_(p_index_a),
              p_index_b_(p_index_b),
              p_index_c_(p_index_c),
              p_gamma_(p_gamma),
              p_beta_(p_beta),
              NumRows_(NumRows),
              EmbeddingDim_(EmbeddingDim),
              IndexLength_(IndexLength),
              epsilon_(epsilon)
        {
            // Each workgroup produces DimPerBlock output rows: the gridwise kernel
            // computes index_start = block_global_id * DimPerBlock. The grid must
            // therefore be sized by DimPerBlock; the previous divisor
            // (DimClusterSize) over-launched - and accessed out of bounds -
            // whenever DimPerBlock != DimClusterSize.
            grid_size_ = (IndexLength + DimPerBlock - 1) / DimPerBlock;
        }

        OutType* p_out_;
        const EmbType* p_emb_a_;
        const EmbType* p_emb_b_;
        const EmbType* p_emb_c_;
        const IndexType* p_index_a_;
        const IndexType* p_index_b_;
        const IndexType* p_index_c_;
        const GammaDataType* p_gamma_;
        const BetaDataType* p_beta_;
        ck::index_t NumRows_;       // rows of each embedding table (vocabulary size)
        ck::index_t EmbeddingDim_;  // embedding width (must equal RowPerBlock)
        ck::index_t IndexLength_;   // number of lookups / output rows
        AccDataType epsilon_;       // layernorm variance epsilon
        size_t grid_size_;          // number of workgroups to launch
    };

    // Type-erased argument factory; pointers are reinterpreted to the
    // instance's concrete element types.
    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(void* p_out,
                                                              const void* p_emb_a,
                                                              const void* p_emb_b,
                                                              const void* p_emb_c,
                                                              const void* p_index_a,
                                                              const void* p_index_b,
                                                              const void* p_index_c,
                                                              const void* p_gamma,
                                                              const void* p_beta,
                                                              ck::index_t NumRows,
                                                              ck::index_t EmbeddingDim,
                                                              ck::index_t IndexLength,
                                                              const AccDataType epsilon)
    {
        return std::make_unique<Argument>(reinterpret_cast<OutType*>(p_out),
                                          reinterpret_cast<const EmbType*>(p_emb_a),
                                          reinterpret_cast<const EmbType*>(p_emb_b),
                                          reinterpret_cast<const EmbType*>(p_emb_c),
                                          reinterpret_cast<const IndexType*>(p_index_a),
                                          reinterpret_cast<const IndexType*>(p_index_b),
                                          reinterpret_cast<const IndexType*>(p_index_c),
                                          reinterpret_cast<const GammaDataType*>(p_gamma),
                                          reinterpret_cast<const BetaDataType*>(p_beta),
                                          NumRows,
                                          EmbeddingDim,
                                          IndexLength,
                                          epsilon);
    }

    using GridwiseSparseEmbedding =
        GridwiseSparseEmbedding3ForwardLayernorm<EmbType,
                                                 IndexType,
                                                 GammaDataType,
                                                 BetaDataType,
                                                 AccDataType,
                                                 OutType,
                                                 decltype(MakeOutputDescriptor(1, 1)),
                                                 BlockSize,
                                                 DimClusterSize,
                                                 RowClusterSize,
                                                 DimPerBlock,
                                                 RowPerBlock,
                                                 DimThreadSize,
                                                 RowVectorSize>;

    struct Invoker : public BaseInvoker
    {
        // Launches the fused kernel and returns the measured average time (ms)
        // as reported by launch_and_time_kernel.
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            auto out_desc = MakeOutputDescriptor(arg.IndexLength_, arg.EmbeddingDim_);
            const auto kernel_main =
                kernel_sparse_embedding3_forward_layernorm<GridwiseSparseEmbedding,
                                                           EmbType,
                                                           IndexType,
                                                           GammaDataType,
                                                           BetaDataType,
                                                           AccDataType,
                                                           OutType,
                                                           decltype(out_desc)>;
            float avg_time = 0;
            avg_time += launch_and_time_kernel(stream_config,
                                               kernel_main,
                                               dim3(arg.grid_size_),
                                               dim3(BlockSize),
                                               0, // no dynamic LDS
                                               arg.p_out_,
                                               arg.p_emb_a_,
                                               arg.p_emb_b_,
                                               arg.p_emb_c_,
                                               arg.p_index_a_,
                                               arg.p_index_b_,
                                               arg.p_index_c_,
                                               arg.p_gamma_,
                                               arg.p_beta_,
                                               out_desc,
                                               arg.epsilon_);
            return avg_time;
        }

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            // p_arg is expected to originate from MakeArgumentPointer of this
            // operator, hence the unchecked dereference mirrors the base contract.
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        };
    };

    static bool IsSupportedArgument(const Argument* p_arg)
    {
        // The kernel has no bounds guard: the embedding width must match the
        // tile width exactly, and both the table row count and the index length
        // must tile evenly by DimPerBlock (each workgroup unconditionally
        // processes DimPerBlock indices).
        return (RowPerBlock == p_arg->EmbeddingDim_) && (p_arg->NumRows_ % DimPerBlock == 0) &&
               (p_arg->IndexLength_ % DimPerBlock == 0);
    }

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        // Guard the downcast: a foreign argument type is "not supported"
        // rather than a null-pointer dereference.
        const auto* arg = dynamic_cast<const Argument*>(p_arg);
        return arg != nullptr && IsSupportedArgument(arg);
    }

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer()
    {
        return std::make_unique<Invoker>();
    }

    // Human-readable instance identifier encoding the tiling parameters.
    std::string GetTypeString() const override
    {
        auto str = std::stringstream();
        // clang-format off
        str << "DeviceSparseEmbedding3ForwardLayernorm_"<< BlockSize << "_" <<
            DimClusterSize << "x" << RowClusterSize << "_" <<
            DimPerBlock << "x" << RowPerBlock << "_" <<
            DimThreadSize << "x" << RowVectorSize;
        // clang-format on
        return str.str();
    }
};
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,344 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/common_header.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
namespace ck {
// __global__ entry point for the fused sparse-embedding + layernorm forward pass.
// It is a thin trampoline: all pointers and scalars are forwarded unchanged to
// GridwiseSparseEmbedding::Run, which contains the actual per-block algorithm.
// Grid/block shape is chosen by the device-level operator (one workgroup per
// DimPerBlock of lookup indices).
template <typename GridwiseSparseEmbedding,
typename EmbType,
typename IndexType,
typename GammaDataType,
typename BetaDataType,
typename AccDataType,
typename OutType,
typename OutGridDesc>
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
__global__ void kernel_sparse_embedding3_forward_layernorm(OutType* p_out,
const EmbType* p_emb_a,
const EmbType* p_emb_b,
const EmbType* p_emb_c,
const IndexType* p_index_a,
const IndexType* p_index_b,
const IndexType* p_index_c,
const GammaDataType* p_gamma,
const BetaDataType* p_beta,
const OutGridDesc out_grid_desc,
const AccDataType epsilon)
{
GridwiseSparseEmbedding::Run(p_out,
p_emb_a,
p_emb_b,
p_emb_c,
p_index_a,
p_index_b,
p_index_c,
p_gamma,
p_beta,
out_grid_desc,
epsilon);
}
// Gridwise (per-workgroup) implementation of the fused kernel.
// Each workgroup:
//   1. loads DimPerBlock lookup indices for tables a/b/c into SGPR buffers,
//   2. loads gamma/beta for its row slice into VGPRs,
//   3. gathers and sums the three embedding rows into an accumulator,
//   4. computes mean/variance via threadwise + blockwise Welford,
//   5. normalizes, applies gamma/beta, and stores the output rows.
// All global accesses use raw AMD buffer-resource loads/stores with BYTE
// offsets; there is no bounds guard, so callers must ensure exact tiling.
template <typename EmbType,
typename IndexType,
typename GammaDataType,
typename BetaDataType,
typename AccDataType,
typename OutType,
typename OutGridDesc,
ck::index_t BlockSize,
ck::index_t DimClusterSize,
ck::index_t RowClusterSize,
ck::index_t DimPerBlock, // Row x Dim, along Dim
ck::index_t RowPerBlock, // Row x Dim, along Row
ck::index_t DimThreadSize, // this is actually not vector, but number of registers
ck::index_t RowVectorSize>
struct GridwiseSparseEmbedding3ForwardLayernorm
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
// Fixed wavefront width (CDNA-style GPUs); RowClusterSize must be wave-aligned.
static constexpr index_t WaveSize = 64;
static_assert(BlockSize == RowClusterSize * DimClusterSize,
"Invalid cluster distribution within block");
static_assert(RowClusterSize % WaveSize == 0, "need to be wavewise");
static_assert(DimPerBlock % (DimClusterSize * DimThreadSize) == 0, "");
static_assert(RowPerBlock % (RowClusterSize * RowVectorSize) == 0, "");
// Number of sequential sub-tiles each thread iterates along dim/row to cover
// the full DimPerBlock x RowPerBlock tile.
static constexpr auto DimSubBlocks = DimPerBlock / (DimClusterSize * DimThreadSize);
static constexpr auto RowSubBlocks = RowPerBlock / (RowClusterSize * RowVectorSize);
static_assert((DimPerBlock % DimSubBlocks == 0) && (RowPerBlock % RowSubBlocks == 0), "");
static constexpr auto DimPerSubBlock = DimPerBlock / DimSubBlocks;
static constexpr auto RowPerSubBlock = RowPerBlock / RowSubBlocks;
// NOTE(review): "Wolford" below is a typo for "Welford"; left as-is since the
// aliases are part of the struct's public surface.
using ThreadwiseWolfordDesc2D = decltype(make_naive_tensor_descriptor_packed(make_tuple(
Number<DimSubBlocks * DimThreadSize>{}, Number<RowSubBlocks * RowVectorSize>{})));
using ThreadwiseWolfordDescReduce = decltype(
make_naive_tensor_descriptor_packed(make_tuple(Number<DimSubBlocks * DimThreadSize>{})));
using ThreadwiseWelford =
ThreadwiseWelford<AccDataType, ThreadwiseWolfordDesc2D, ThreadwiseWolfordDescReduce>;
using ThreadClusterLength = Sequence<DimClusterSize, RowClusterSize>;
using BlockwiseWelford =
BlockwiseWelford<AccDataType, BlockSize, ThreadClusterLength, Sequence<0, 1>>;
// Per-workgroup body; see struct-level comment for the overall flow.
// The OutGridDesc parameter is currently unused (its lengths are commented
// out below); addressing is done directly via RowPerBlock strides.
__device__ static void Run(OutType* p_out,
const EmbType* p_emb_a,
const EmbType* p_emb_b,
const EmbType* p_emb_c,
const IndexType* p_index_a,
const IndexType* p_index_b,
const IndexType* p_index_c,
const GammaDataType* p_gamma,
const BetaDataType* p_beta,
const OutGridDesc,
const AccDataType epsilon)
{
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
// const auto index_length = out_grid_desc.GetLength(I0);
// const auto emb_dim = out_grid_desc.GetLength(I1);
// Map the linear thread id to a (dim, row) position within the block cluster.
constexpr auto thread_cluster_desc =
make_cluster_descriptor(Sequence<DimClusterSize, RowClusterSize>{}, Sequence<0, 1>{});
const auto thread_cluster_idx =
thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
const auto thread_dim_cluster_id = thread_cluster_idx[I0];
const auto thread_row_cluster_id = thread_cluster_idx[I1];
// NOTE(review): wave_dim_id divides the DIM cluster id by WaveSize although
// the static_assert above makes the ROW axis the wave-aligned one; confirm
// the intended wave decomposition against the instance configurations.
const auto wave_dim_id = __builtin_amdgcn_readfirstlane(thread_dim_cluster_id / WaveSize);
// First lookup index owned by this block (plus a per-wave dim offset).
const auto index_start = block_global_id * DimPerBlock + wave_dim_id * DimThreadSize;
auto threadwise_welford = ThreadwiseWelford();
threadwise_welford.max_count_ = RowSubBlocks * RowVectorSize;
// Register-buffer geometry: full tile, per-dim mean/var, per-row gamma/beta.
constexpr auto thread_buf_size =
DimSubBlocks * DimThreadSize * RowSubBlocks * RowVectorSize;
constexpr auto thread_buf_desc = make_naive_tensor_descriptor_packed(
make_tuple(DimSubBlocks, DimThreadSize, RowSubBlocks, RowVectorSize));
constexpr auto mean_var_buf_size = DimSubBlocks * DimThreadSize;
constexpr auto mean_var_buf_desc =
make_naive_tensor_descriptor_packed(make_tuple(DimSubBlocks, DimThreadSize));
constexpr auto gamma_beta_buf_size = RowSubBlocks * RowVectorSize;
constexpr auto gamma_beta_buf_desc =
make_naive_tensor_descriptor_packed(make_tuple(RowSubBlocks, RowVectorSize));
// VGPR staging for the three gathered embedding rows.
StaticBuffer<AddressSpaceEnum::Vgpr, EmbType, thread_buf_size, true> in_thread_buf_a;
StaticBuffer<AddressSpaceEnum::Vgpr, EmbType, thread_buf_size, true> in_thread_buf_b;
StaticBuffer<AddressSpaceEnum::Vgpr, EmbType, thread_buf_size, true> in_thread_buf_c;
// SGPR buffers: lookup indices are uniform across the wave.
StaticBuffer<AddressSpaceEnum::Sgpr, IndexType, DimPerBlock, true> index_buf_a;
StaticBuffer<AddressSpaceEnum::Sgpr, IndexType, DimPerBlock, true> index_buf_b;
StaticBuffer<AddressSpaceEnum::Sgpr, IndexType, DimPerBlock, true> index_buf_c;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, thread_buf_size, true> acc_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, gamma_beta_buf_size, true>
gamma_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, gamma_beta_buf_size, true>
beta_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, mean_var_buf_size, true> mean_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, mean_var_buf_size, true> var_thread_buf;
// Gather one (dim-sub, row-sub) tile from all three embedding tables into
// the VGPR staging buffers. thread_offset is a BYTE offset into the row;
// each table row starts at index * RowPerBlock elements.
auto load_current_sub_row = [&](auto i_dim_sub_, auto i_row_sub_) {
vector_type_maker_t<EmbType, RowVectorSize> emb_vector_a;
vector_type_maker_t<EmbType, RowVectorSize> emb_vector_b;
vector_type_maker_t<EmbType, RowVectorSize> emb_vector_c;
using src_vector_t = typename decltype(emb_vector_a)::type;
static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) {
constexpr auto current_dim = i_dim_sub_ * DimPerSubBlock + i_dim_vec_;
IndexType index_a = index_buf_a[Number<current_dim>{}];
IndexType index_b = index_buf_b[Number<current_dim>{}];
IndexType index_c = index_buf_c[Number<current_dim>{}];
auto thread_offset = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) *
sizeof(EmbType) * RowVectorSize;
int32x4_t emb_res_a =
make_wave_buffer_resource_with_default_range(p_emb_a + index_a * RowPerBlock);
int32x4_t emb_res_b =
make_wave_buffer_resource_with_default_range(p_emb_b + index_b * RowPerBlock);
int32x4_t emb_res_c =
make_wave_buffer_resource_with_default_range(p_emb_c + index_c * RowPerBlock);
emb_vector_a.template AsType<src_vector_t>()(I0) =
amd_buffer_load_impl<EmbType, RowVectorSize>(emb_res_a, thread_offset, 0);
emb_vector_b.template AsType<src_vector_t>()(I0) =
amd_buffer_load_impl<EmbType, RowVectorSize>(emb_res_b, thread_offset, 0);
emb_vector_c.template AsType<src_vector_t>()(I0) =
amd_buffer_load_impl<EmbType, RowVectorSize>(emb_res_c, thread_offset, 0);
// Scatter the loaded vector lanes into the packed register tile.
static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
constexpr auto register_offset = thread_buf_desc.CalculateOffset(
make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
in_thread_buf_a(Number<register_offset>{}) =
emb_vector_a.template AsType<EmbType>()[i_row_vec_];
in_thread_buf_b(Number<register_offset>{}) =
emb_vector_b.template AsType<EmbType>()[i_row_vec_];
in_thread_buf_c(Number<register_offset>{}) =
emb_vector_c.template AsType<EmbType>()[i_row_vec_];
});
});
};
// Element-wise sum of the three staged embedding tiles into the accumulator
// (acc starts at zero, so after one pass acc == a + b + c).
auto accumulate_current_sub_row = [&](auto i_dim_sub_, auto i_row_sub_) {
static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) {
static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
constexpr auto register_offset = thread_buf_desc.CalculateOffset(
make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
AccDataType va =
ck::type_convert<AccDataType>(in_thread_buf_a(Number<register_offset>{}));
AccDataType vb =
ck::type_convert<AccDataType>(in_thread_buf_b(Number<register_offset>{}));
AccDataType vc =
ck::type_convert<AccDataType>(in_thread_buf_c(Number<register_offset>{}));
acc_thread_buf(Number<register_offset>{}) += va + vb + vc;
});
});
};
// Feed the accumulated tile into the per-thread Welford running mean/variance
// (one mean/var pair per dim element, reduced over the row axis).
auto threadwise_welford_sub_row = [&](auto i_dim_sub_, auto i_row_sub_) {
static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) {
static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
constexpr auto register_offset = thread_buf_desc.CalculateOffset(
make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
constexpr auto mean_var_offset =
mean_var_buf_desc.CalculateOffset(make_tuple(i_dim_sub_, i_dim_vec_));
threadwise_welford.cur_count_++;
threadwise_welford.Update(mean_thread_buf(Number<mean_var_offset>{}),
var_thread_buf(Number<mean_var_offset>{}),
acc_thread_buf(Number<register_offset>{}));
});
});
};
// Normalize with the block-reduced mean/var, apply gamma/beta, convert to
// OutType and store with a raw buffer store (BYTE offsets again).
auto threadwise_normalize_store_out = [&](auto i_dim_sub_, auto i_row_sub_) {
int32x4_t out_res =
make_wave_buffer_resource_with_default_range(p_out + index_start * RowPerBlock);
static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) {
vector_type_maker_t<OutType, RowVectorSize> out_vector;
using dst_vector_t = typename decltype(out_vector)::type;
constexpr auto mean_var_offset =
mean_var_buf_desc.CalculateOffset(make_tuple(i_dim_sub_, i_dim_vec_));
static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
constexpr auto register_offset = thread_buf_desc.CalculateOffset(
make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
constexpr auto gamma_beta_offset =
gamma_beta_buf_desc.CalculateOffset(make_tuple(i_row_sub_, i_row_vec_));
auto acc_val = acc_thread_buf[Number<register_offset>{}];
// layernorm core: (x - mean) / sqrt(var + eps) * gamma + beta
acc_val = (acc_val - mean_thread_buf(Number<mean_var_offset>{})) /
sqrt(var_thread_buf(Number<mean_var_offset>{}) + epsilon);
acc_val = acc_val * gamma_thread_buf[Number<gamma_beta_offset>{}] +
beta_thread_buf[Number<gamma_beta_offset>{}];
out_vector.template AsType<OutType>()(Number<i_row_vec_>{}) =
type_convert<OutType>(acc_val);
});
index_t thread_offset = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) *
sizeof(OutType) * RowVectorSize;
// NOTE(review): the store offset does not include an i_dim_vec_/i_dim_sub_
// row term; confirm against the instances that DimThreadSize rows are
// addressed as intended here.
amd_buffer_store_impl<OutType, RowVectorSize>(
out_vector.template AsType<dst_vector_t>()[Number<0>{}],
out_res,
thread_offset,
0);
});
};
// first load index
ck::static_for<0, DimPerBlock, 1>{}([&](auto i_idx_) {
// prefer use s_load
index_buf_a(i_idx_) = p_index_a[index_start + i_idx_.value];
index_buf_b(i_idx_) = p_index_b[index_start + i_idx_.value];
index_buf_c(i_idx_) = p_index_c[index_start + i_idx_.value];
});
// load gamma/beta
static_for<0, RowSubBlocks, 1>{}([&](auto i_row_sub_) {
vector_type_maker_t<GammaDataType, RowVectorSize> gamma_vector;
vector_type_maker_t<BetaDataType, RowVectorSize> beta_vector;
index_t thread_offset_gamma = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) *
sizeof(GammaDataType) * RowVectorSize;
index_t thread_offset_beta = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) *
sizeof(BetaDataType) * RowVectorSize;
int32x4_t gamma_res = make_wave_buffer_resource_with_default_range(p_gamma);
int32x4_t beta_res = make_wave_buffer_resource_with_default_range(p_beta);
gamma_vector.template AsType<typename decltype(gamma_vector)::type>()(I0) =
amd_buffer_load_impl<GammaDataType, RowVectorSize>(
gamma_res, thread_offset_gamma, 0);
beta_vector.template AsType<typename decltype(beta_vector)::type>()(I0) =
amd_buffer_load_impl<BetaDataType, RowVectorSize>(beta_res, thread_offset_beta, 0);
// Convert gamma/beta to the accumulator type once, up front.
static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
constexpr auto offset =
gamma_beta_buf_desc.CalculateOffset(make_tuple(i_row_sub_, i_row_vec_));
gamma_thread_buf(Number<offset>{}) = type_convert<AccDataType>(
gamma_vector.template AsType<GammaDataType>()[Number<i_row_vec_>{}]);
beta_thread_buf(Number<offset>{}) = type_convert<AccDataType>(
beta_vector.template AsType<BetaDataType>()[Number<i_row_vec_>{}]);
});
});
// Zero-initialize accumulator and Welford state.
static_for<0, thread_buf_size, 1>{}(
[&](auto I) { acc_thread_buf(I) = type_convert<AccDataType>(0.0f); });
static_for<0, mean_var_buf_size, 1>{}([&](auto I) {
mean_thread_buf(I) = type_convert<AccDataType>(0.0f);
var_thread_buf(I) = type_convert<AccDataType>(0.0f);
});
// Main loop: software-pipelined load/accumulate/welford over row sub-blocks
// (load of sub-row k+1 overlaps accumulation of sub-row k), then a blockwise
// Welford reduction and the normalized store.
static_for<0, DimSubBlocks, 1>{}([&](auto i_dim_sub) {
load_current_sub_row(i_dim_sub, Number<0>{});
static_for<0, RowSubBlocks - 1, 1>{}([&](auto i_row) {
load_current_sub_row(i_dim_sub, Number<1>{} + i_row);
accumulate_current_sub_row(i_dim_sub, i_row);
threadwise_welford_sub_row(i_dim_sub, i_row);
});
accumulate_current_sub_row(i_dim_sub, Number<RowSubBlocks - 1>{});
threadwise_welford_sub_row(i_dim_sub, Number<RowSubBlocks - 1>{});
// blockwise welford
static_for<0, mean_var_buf_size, 1>{}([&](auto I) {
// LDS barrier between successive reductions (the first reuses clean LDS).
if constexpr(I > 0)
block_sync_lds();
BlockwiseWelford::Run(
mean_thread_buf(I), var_thread_buf(I), threadwise_welford.cur_count_);
});
// store
static_for<0, RowSubBlocks, 1>{}(
[&](auto i_row) { threadwise_normalize_store_out(i_dim_sub, i_row); });
});
}
};
} // namespace ck

View File

@@ -34,6 +34,21 @@ __device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_
return wave_buffer_resource.content;
}
// Build a 128-bit AMD buffer-resource descriptor (V#) for wave-uniform raw
// buffer load/store intrinsics, with the range field set to its maximum
// (0xffffffff) instead of the actual allocation size. With this "default
// range" the hardware performs no out-of-bounds clamping, so callers must
// keep every byte offset within the underlying allocation themselves.
template <typename T>
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T* p_wave)
{
BufferResource<T> wave_buffer_resource;
// wavewise base address (64 bit); const_cast only strips cv-qualifiers for
// storage in the descriptor - the pointee is never written through here.
wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
// wavewise range (32 bit)
wave_buffer_resource.range(Number<2>{}) = 0xffffffff; // max possible range
// wavewise setting (32 bit): data format / cache policy dword
wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
return wave_buffer_resource.content;
}
// buffer load i8
__device__ int8_t
llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,