mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
BatchNorm backward implementation (#461)
* Implemented batchnorm-backward blockwise and multiblock kernels
* Add batchnorm-backward device op
* Add batchnorm-backward host-reference op
* Add batchnorm-backward example
* Rename parameters in batchnorm backward kernels and device op
* Change the example to loosen the threshold for ScaleDiff checking
* Add comments to explain the implementation of batchnorm-backward
* Rename parameters again in batchnorm backward kernels
* Improve the expression calculation for performance
* Add batchnorm backward to README
* Add comments to explain inv-variance in batchnorm forward and backward
* Rename the batchnorm forward training and inferring examples
* Add/update the comments for batchnorm-backward kernels
* Rename again
* Add block_sync_lds between two consecutive blockwise reductions
* Move the common expression 1/N out of the static_for loops
* Add dy_elementwise_op
* Rename in the backward example again
* Add checking for reduceDims in reference_batchnorm_backward
* Update comments and code format
* Rename in the comments
* Move the common expression out of the loop in reference_batchnorm_backward_nhwc_c
* Add block_sync_lds() between blockwise reductions again
* Fix comments again
* Remove int8 from batchnorm-forward instances since it is not needed for forward training and could fail tests
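For reference, the quantities these kernels produce follow the standard batch-normalization backward derivation (a hedged summary for orientation, not text from the PR; here N is the size of the reduced N*H*W dimensions, invVar = 1/sqrt(variance + epsilon) as saved or recomputed via Welford, and dy is taken after dy_elementwise_op):

```math
d\beta = \sum_{i=1}^{N} dy_i, \qquad
d\gamma = \sum_{i=1}^{N} dy_i \, \hat{x}_i, \qquad
dx_i = \frac{\gamma \cdot \mathrm{invVar}}{N}\left(N \, dy_i - d\beta - \hat{x}_i \, d\gamma\right),
\qquad \hat{x}_i = (x_i - \mu) \cdot \mathrm{invVar}
```

Factoring 1/N (and gamma * invVar) out of the per-element loop is the "common expression" optimization mentioned in the commit message above.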
@@ -1,2 +1,3 @@
-add_example_executable(example_batchnorm_forward batchnorm_forward_nhwc.cpp)
-add_example_executable(example_batchnorm_infer batchnorm_infer_nhwc.cpp)
+add_example_executable(example_batchnorm_forward_training batchnorm_forward_training_nhwc.cpp)
+add_example_executable(example_batchnorm_forward_inferring batchnorm_forward_inferring_nhwc.cpp)
+add_example_executable(example_batchnorm_backward batchnorm_backward_nhwc.cpp)
@@ -53,4 +53,29 @@ Start running 10 times...
Perf: 1.28235 ms, 523.329 GB/s
```

## Run ```batchnorm backward nhwc```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x>   : verification (0=no, 1=yes)
# Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64)
# Arg2: 1/0 to indicate whether to use saved mean and invVariance
# Arg3: init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
# Arg4: time kernel (0=no, 1=yes)
# Arg5: use multi-block welford (0=no, 1=yes)
./bin/example_batchnorm_backward -D 128,16,3,1024 -v 1 0 0 3 1 1
```

Result
```
./bin/example_batchnorm_backward -D 128,16,3,1024 -v 1 0 0 3 1 1
launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 0.411026 ms, 91.8702 GB/s
```
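Note (added for orientation): in the multi-block Welford path without saved mean/invVariance, the device op launches three kernels per run (multiblock Welford first half; Welford second half fused with the first half of the dscale/dbias reduction; final reduction fused with the dx computation), which is why the timed output above shows three warm-up/run groups.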
502
example/34_batchnorm/batchnorm_backward_nhwc.cpp
Normal file
@@ -0,0 +1,502 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cmath>
#include <limits>
#include <iostream>
#include <thread>
#include <getopt.h>

#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward_nhwc_c.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp"

static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
                                       {"verify", required_argument, nullptr, 'v'},
                                       {"help", no_argument, nullptr, '?'},
                                       {nullptr, 0, nullptr, 0}};

class BatchNormBwdArg
{
    private:
    int option_index = 0;

    public:
    std::vector<size_t> inOutLengths;

    bool do_verification = false;

    bool haveSavedMeanInvVar;

    int data_type               = 0;
    int init_method             = 3;
    bool time_kernel            = false;
    bool use_multiblock_welford = false;

    public:
    void show_usage(const char* cmd)
    {
        // clang-format off
        std::cout << "Usage of " << cmd << std::endl;
        std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl;
        std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl;
        std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64)" << std::endl;
        std::cout << "Arg2: 1/0 to indicate whether to use saved mean and invVariance" << std::endl;
        std::cout << "Arg3: init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl;
        std::cout << "Arg4: time kernel (0=no, 1=yes)" << std::endl;
        std::cout << "Arg5: use multi-block welford (0=no, 1=yes)" << std::endl;
        // clang-format on
    };

    int processArgs(int argc, char* argv[])
    {
        using ck::host_common::getTypeValuesFromString;

        int ch;

        while(1)
        {
            ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
            if(ch == -1)
                break;
            switch(ch)
            {
            case 'D':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                inOutLengths = getTypeValuesFromString<size_t>(optarg);

                if(inOutLengths.size() != 4)
                    throw std::runtime_error(
                        "NHWC tensor layout should have 4 length values specified!");
                break;
            case 'v':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                do_verification = static_cast<bool>(std::atoi(optarg));
                break;
            case '?':
                if(std::string(long_options[option_index].name) == "help")
                {
                    show_usage(argv[0]);
                    return (-1);
                };
                break;
            default: show_usage(argv[0]); return (-1);
            };
        };

        if(optind + 5 > argc)
            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");

        data_type              = std::atoi(argv[optind++]);
        haveSavedMeanInvVar    = std::atoi(argv[optind++]);
        init_method            = std::atoi(argv[optind++]);
        time_kernel            = static_cast<bool>(std::atoi(argv[optind++]));
        use_multiblock_welford = static_cast<bool>(std::atoi(argv[optind]));

        return (0);
    };
};

using namespace ck;

template <typename InOutDataType, typename AccDataType, bool UseMultiblockInK>
bool bnorm_bwd_nhwc_test(bool do_verification,
                         int init_method,
                         bool time_kernel,
                         const std::vector<size_t> inOutLengths,
                         bool haveSavedMeanInvVar,
                         double epsilon)
{
    // for NHWC BatchNorm, mean and inv-variance are computed over the N, H, W dimensions
    constexpr index_t Rank         = 4;
    constexpr index_t NumReduceDim = 3;

    const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};

    // input data of the batchnorm backward algorithm
    Tensor<InOutDataType> x(inOutLengths);
    Tensor<InOutDataType> dy(inOutLengths);

    Tensor<AccDataType> bnScale(scaleBiasMeanVarLengths);

    Tensor<AccDataType> savedMean(scaleBiasMeanVarLengths);
    Tensor<AccDataType> savedInvVar(scaleBiasMeanVarLengths);
    // savedVariance is only used for initializing savedInvVar
    Tensor<AccDataType> savedVariance(scaleBiasMeanVarLengths);

    // output data of the batchnorm backward algorithm
    Tensor<InOutDataType> dx_ref(inOutLengths);
    Tensor<InOutDataType> dx(inOutLengths);

    Tensor<AccDataType> dscale(scaleBiasMeanVarLengths);
    Tensor<AccDataType> dbias(scaleBiasMeanVarLengths);

    Tensor<AccDataType> dscale_ref(scaleBiasMeanVarLengths);
    Tensor<AccDataType> dbias_ref(scaleBiasMeanVarLengths);

    auto inOutStrides            = dy.mDesc.GetStrides();
    auto scaleBiasMeanVarStrides = dscale.mDesc.GetStrides();

    std::size_t num_thread = std::thread::hardware_concurrency();

    if(haveSavedMeanInvVar)
    {
        const float x_mean       = 0.0f;
        const float x_stddev     = 1.0f;
        const float noise_stddev = 0.0001f;

        // input data in normal distribution
        x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);

        // initialize the savedMean to be values with tiny variation to the mean of the x values
        savedMean.GenerateTensorValue(GeneratorTensor_4<AccDataType>{x_mean, noise_stddev},
                                      num_thread);

        // initialize the variance to be values with tiny variation to the variance of the x values
        savedVariance.GenerateTensorValue(
            GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);

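        // comment added for clarity: convert the generated variance into inv-variance,
        // invVariance = 1/sqrt(variance + epsilon), which is the form the backward
        // kernels consume (the epsilon here is just the float machine epsilon)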
        auto it_src       = savedVariance.mData.begin();
        auto it_dst       = savedInvVar.mData.begin();
        float tmp_epsilon = std::numeric_limits<float>::epsilon();

        while(it_src != savedVariance.mData.end())
        {
            *it_dst = type_convert<AccDataType>(
                1.0f / std::sqrt(type_convert<float>(*it_src) + tmp_epsilon));

            it_src++;
            it_dst++;
        };
    }
    else
    {
        const float x_mean   = 0.0f;
        const float x_stddev = 1.0f;

        // input data in normal distribution
        x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);
    };

    if(do_verification)
    {
        switch(init_method)
        {
        case 0:
            dy.GenerateTensorValue(GeneratorTensor_0<InOutDataType>{}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_0<InOutDataType>{}, num_thread);
            break;
        case 1:
            dy.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            break;
        case 2:
            dy.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            break;
        default:
            dy.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-0.2f, 0.2f}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-0.5f, 0.5f}, num_thread);
        }
    };

    // input data of the batchnorm backward algorithm
    DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize());
    DeviceMem dy_dev(sizeof(InOutDataType) * dy.mDesc.GetElementSpaceSize());

    DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize());

    DeviceMem savedMean_dev(sizeof(AccDataType) * savedMean.mDesc.GetElementSpaceSize());
    DeviceMem savedInvVar_dev(sizeof(AccDataType) * savedInvVar.mDesc.GetElementSpaceSize());

    // output data of the batchnorm backward algorithm
    DeviceMem dx_dev(sizeof(InOutDataType) * dx.mDesc.GetElementSpaceSize());

    DeviceMem dscale_dev(sizeof(AccDataType) * dscale.mDesc.GetElementSpaceSize());
    DeviceMem dbias_dev(sizeof(AccDataType) * dbias.mDesc.GetElementSpaceSize());

    x_dev.ToDevice(x.mData.data());
    dy_dev.ToDevice(dy.mData.data());
    bnScale_dev.ToDevice(bnScale.mData.data());

    if(haveSavedMeanInvVar)
    {
        savedMean_dev.ToDevice(savedMean.mData.data());
        savedInvVar_dev.ToDevice(savedInvVar.mData.data());
    };

    std::array<index_t, Rank> i_inOutLengths;
    std::array<index_t, Rank> i_inOutStrides;
    std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
    std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;

    std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin());
    std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin());
    std::copy(scaleBiasMeanVarLengths.begin(),
              scaleBiasMeanVarLengths.end(),
              i_scaleBiasMeanVarLengths.begin());
    std::copy(scaleBiasMeanVarStrides.begin(),
              scaleBiasMeanVarStrides.end(),
              i_scaleBiasMeanVarStrides.begin());

    using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;

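    // comment added for clarity: the unnamed numeric template arguments below are, in order,
    // BlockSize = 256, MThreadClusterSize = 16, KThreadClusterSize = 16, MThreadSliceSize = 1,
    // KThreadSliceSize = 2 and XDyDxVectorDim = 0 (see DeviceBatchNormBwdImpl)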
    using DeviceBatchNormBwdInstance =
        ck::tensor_operation::device::DeviceBatchNormBwdImpl<InOutDataType,
                                                             InOutDataType,
                                                             InOutDataType,
                                                             AccDataType,
                                                             AccDataType, // ScaleDataType
                                                             AccDataType, // BiasDataType
                                                             AccDataType, // MeanVarDataType
                                                             PassThroughOp,
                                                             Rank,
                                                             NumReduceDim,
                                                             UseMultiblockInK,
                                                             256,
                                                             16,
                                                             16,
                                                             1,
                                                             2,
                                                             0,
                                                             1, // XSrcVectorSize
                                                             1, // DySrcVectorSize
                                                             1, // DxDstVectorSize
                                                             1, // ScaleSrcDstVectorSize
                                                             1, // BiasDstVectorSize
                                                             1>; // MeanVarSrcVectorSize

    auto batchnorm_bwd = DeviceBatchNormBwdInstance{};

    auto argument_ptr = batchnorm_bwd.MakeArgumentPointer(
        i_inOutLengths,
        i_inOutStrides,
        i_inOutStrides,
        i_inOutStrides,
        {0, 1, 2},
        i_scaleBiasMeanVarLengths,
        i_scaleBiasMeanVarStrides,
        i_scaleBiasMeanVarStrides,
        i_scaleBiasMeanVarStrides,
        x_dev.GetDeviceBuffer(),
        dy_dev.GetDeviceBuffer(),
        bnScale_dev.GetDeviceBuffer(),
        haveSavedMeanInvVar ? savedMean_dev.GetDeviceBuffer() : nullptr,
        haveSavedMeanInvVar ? savedInvVar_dev.GetDeviceBuffer() : nullptr,
        epsilon,
        PassThroughOp{},
        dx_dev.GetDeviceBuffer(),
        dscale_dev.GetDeviceBuffer(),
        dbias_dev.GetDeviceBuffer());

    if(!batchnorm_bwd.IsSupportedArgument(argument_ptr.get()))
    {
        std::cout << "The runtime parameters seem not to be supported by the BatchNorm device "
                     "instance, exiting!"
                  << std::endl;
        return (false);
    };

    size_t workspace_sz = batchnorm_bwd.GetWorkSpaceSize(argument_ptr.get());

    DeviceMem workspace_dev(workspace_sz);

    batchnorm_bwd.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());

    auto invoker_ptr = batchnorm_bwd.MakeInvokerPointer();

    if(time_kernel)
    {
        float avg_time   = 0.0f;
        size_t num_bytes = 0;

        size_t total_length = inOutLengths[0] * inOutLengths[1] * inOutLengths[2] * inOutLengths[3];
        size_t invariant_length = inOutLengths[3];

        avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        // reading of x, dy, scale; writing of dx, dscale, dbias
        num_bytes +=
            total_length * sizeof(InOutDataType) * 3 + invariant_length * sizeof(AccDataType) * 3;

        // reading of saved mean and inv-variance
        num_bytes += haveSavedMeanInvVar ? invariant_length * sizeof(AccDataType) * 2 : 0;

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
    }
    else
        (void)invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

    bool pass = true;

    if(do_verification)
    {
        using ReferenceBatchNormBwdInstance =
            ck::tensor_operation::host::ReferenceBatchNormBwd_Input_N_H_W_C_Output_C<
                InOutDataType,
                InOutDataType,
                InOutDataType,
                AccDataType,
                AccDataType,
                AccDataType,
                AccDataType,
                PassThroughOp>;

        auto batchNormBwd_ref = ReferenceBatchNormBwdInstance{};

        auto argument_ptr_ref = batchNormBwd_ref.MakeArgumentPointer(
            i_inOutLengths,
            i_inOutStrides,
            i_inOutStrides,
            i_inOutStrides,
            {0, 1, 2},
            i_scaleBiasMeanVarLengths,
            i_scaleBiasMeanVarStrides,
            i_scaleBiasMeanVarStrides,
            i_scaleBiasMeanVarStrides,
            x.mData.data(),
            dy.mData.data(),
            bnScale.mData.data(),
            haveSavedMeanInvVar ? savedMean.mData.data() : nullptr,
            haveSavedMeanInvVar ? savedInvVar.mData.data() : nullptr,
            epsilon,
            PassThroughOp{},
            dx_ref.mData.data(),
            dscale_ref.mData.data(),
            dbias_ref.mData.data());

        if(!batchNormBwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
        {
            std::cout
                << "The runtime parameters seem not to be supported by the reference instance, "
                   "exiting!"
                << std::endl;
            return (false);
        };

        auto invoker_ptr_ref = batchNormBwd_ref.MakeInvokerPointer();

        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());

        dx_dev.FromDevice(dx.mData.data());
        dscale_dev.FromDevice(dscale.mData.data());
        dbias_dev.FromDevice(dbias.mData.data());

        // clang-format off
        pass = pass && ck::utils::check_err(dbias.mData, dbias_ref.mData, "dBias result:", 1e-5, 1e-5);
        pass = pass && ck::utils::check_err(dscale.mData, dscale_ref.mData, "dScale result:", 1e-5, 2e-4);
        pass = pass && ck::utils::check_err(dx.mData, dx_ref.mData, "dx result:");
        // clang-format on
    };

    return (pass);
};

static const double epsilon = std::numeric_limits<float>::epsilon();

int main(int argc, char* argv[])
{
    bool pass = true;

    if(argc > 1)
    {
        BatchNormBwdArg arg;

        if(arg.processArgs(argc, argv) < 0)
            return (-1);

        if(arg.data_type == 0)
        {
            if(arg.use_multiblock_welford)
                pass = bnorm_bwd_nhwc_test<ck::half_t, float, true>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
            else
                pass = bnorm_bwd_nhwc_test<ck::half_t, float, false>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
        }
        else if(arg.data_type == 1)
        {
            if(arg.use_multiblock_welford)
                pass = bnorm_bwd_nhwc_test<float, float, true>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
            else
                pass = bnorm_bwd_nhwc_test<float, float, false>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
        }
        else if(arg.data_type == 5)
        {
            if(arg.use_multiblock_welford)
                pass = bnorm_bwd_nhwc_test<ck::bhalf_t, float, true>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
            else
                pass = bnorm_bwd_nhwc_test<ck::bhalf_t, float, false>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
        }
        else if(arg.data_type == 6)
        {
            if(arg.use_multiblock_welford)
                pass = bnorm_bwd_nhwc_test<double, double, true>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
            else
                pass = bnorm_bwd_nhwc_test<double, double, false>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
        }
    }
    else
    {
        pass = bnorm_bwd_nhwc_test<ck::half_t, float, true>(
            true, 3, false /* don't time kernel */, {128, 16, 6, 512}, false, epsilon);

        pass = pass && bnorm_bwd_nhwc_test<ck::half_t, float, false>(
                           true, 3, false /* don't time kernel */, {128, 16, 3, 1024}, false, epsilon);
    };

    return (pass ? 0 : 1);
}
@@ -0,0 +1,51 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <array>
#include <memory>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <index_t Rank, index_t NumBatchNormReduceDim, typename DyElementwiseOp>
struct DeviceBatchNormBwd : public BaseOperator
{
    static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim;

    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::array<index_t, Rank> xyLengths,
                        const std::array<index_t, Rank> xStrides,
                        const std::array<index_t, Rank> dyStrides,
                        const std::array<index_t, Rank> dxStrides,
                        const std::array<int, NumBatchNormReduceDim> reduceDims,
                        const std::array<ck::index_t, NumInvariantDim> bnScaleBiasMeanVarLengths,
                        const std::array<ck::index_t, NumInvariantDim> bnScaleStrides,
                        const std::array<ck::index_t, NumInvariantDim> bnBiasStrides,
                        const std::array<ck::index_t, NumInvariantDim> bnMeanVarStrides,
                        const void* p_x,
                        const void* p_dy,
                        const void* p_scale,
                        const void* p_savedMean,
                        const void* p_savedInvVar,
                        double epsilon,
                        const DyElementwiseOp dy_elementwise_op,
                        void* p_dx,
                        void* p_dscale,
                        void* p_dbias) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

template <index_t Rank, index_t NumBatchNormReduceDim, typename DyElementwiseOp>
using DeviceBatchNormBwdPtr =
    std::unique_ptr<DeviceBatchNormBwd<Rank, NumBatchNormReduceDim, DyElementwiseOp>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -0,0 +1,866 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp"
#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp"
#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/welford_helper.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename XDataType,
          typename DxDataType,
          typename DyDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          index_t Rank,
          index_t NumBatchNormReduceDim,
          bool UseMultiblockInK,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XDyDxVectorDim,
          index_t XSrcVectorSize,
          index_t DySrcVectorSize,
          index_t DxDstVectorSize,
          index_t ScaleSrcDstVectorSize,
          index_t BiasDstVectorSize,
          index_t MeanVarSrcVectorSize>
struct DeviceBatchNormBwdImpl
    : public DeviceBatchNormBwd<Rank, NumBatchNormReduceDim, DyElementwiseOp>
{
    static_assert(Rank <= 6, "Rank larger than 6 is not supported!");
    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                  "Invalid thread cluster size assignments!");

    static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 &&
                   MThreadSliceSize % DySrcVectorSize == 0 &&
                   MThreadSliceSize % DxDstVectorSize == 0) ||
                      (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 &&
                       KThreadSliceSize % DySrcVectorSize == 0 &&
                       KThreadSliceSize % DxDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim;

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

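    // comment added for clarity: merge the invariant dimensions into M and the reduce
    // dimensions into K, then right-pad M to a multiple of M_BlockTileSize and K up to
    // blkGroupSize * numBlockTileIteration * K_BlockTileSize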
    static auto MakeXY2dDescriptor(const std::array<index_t, Rank>& xyLengths,
                                   const std::array<index_t, Rank>& xyStrides,
                                   int blkGroupSize,
                                   int numBlockTileIteration)
    {
        const auto tupleXYLengths =
            generate_tuple([&](auto I) { return xyLengths[I]; }, Number<Rank>{});
        const auto tupleXYStrides =
            generate_tuple([&](auto I) { return xyStrides[I]; }, Number<Rank>{});

        const auto raw_grid_desc = make_naive_tensor_descriptor(tupleXYLengths, tupleXYStrides);

        const auto grid_desc_m_k = [&]() {
            using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
            using ReduceDims    = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

            const auto reduceDimLengths =
                generate_tuple([&](auto I) { return xyLengths[NumInvariantDim + I]; },
                               Number<NumBatchNormReduceDim>{});
            const auto invariantDimLengths =
                generate_tuple([&](auto I) { return xyLengths[I]; }, Number<NumInvariantDim>{});

            return transform_tensor_descriptor(
                raw_grid_desc,
                make_tuple(make_merge_transform(invariantDimLengths),
                           make_merge_transform(reduceDimLengths)),
                make_tuple(InvariantDims{}, ReduceDims{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }();

        const auto invariantLength = grid_desc_m_k.GetLength(Number<0>{});
        const auto reduceLength    = grid_desc_m_k.GetLength(Number<1>{});

        const int workSizePerBlock = K_BlockTileSize * numBlockTileIteration;
        const auto mPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto kPad = workSizePerBlock * blkGroupSize - reduceLength;

        auto grid_desc_m_k_padded =
            transform_tensor_descriptor(grid_desc_m_k,
                                        make_tuple(make_right_pad_transform(invariantLength, mPad),
                                                   make_right_pad_transform(reduceLength, kPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (grid_desc_m_k_padded);
    };

    static auto MakeMultiblockFirstReduceOutputMG2dDescriptor(int invariantLength, int blkGroupSize)
    {
        const auto grid_desc_m_g =
            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize));

        const auto mPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto grid_desc_m_g_padded =
            transform_tensor_descriptor(grid_desc_m_g,
                                        make_tuple(make_right_pad_transform(invariantLength, mPad),
                                                   make_pass_through_transform(blkGroupSize)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (grid_desc_m_g_padded);
    };

    static auto MakeMultiblockFinalReduceInputMK2dDescriptor(int invariantLength, int blkGroupSize)
    {
        const auto reduceLength = blkGroupSize;
        const auto grid_desc_m_k =
            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, reduceLength));

        const auto mPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto kPad =
            math::integer_least_multiple(reduceLength, KThreadClusterSize) - reduceLength;

        auto grid_desc_m_k_padded =
            transform_tensor_descriptor(grid_desc_m_k,
                                        make_tuple(make_right_pad_transform(invariantLength, mPad),
                                                   make_right_pad_transform(reduceLength, kPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (grid_desc_m_k_padded);
    };

    static auto
    MakeScaleBiasMeanVar1dDescriptor(const std::array<index_t, NumInvariantDim>& lengths,
                                     const std::array<index_t, NumInvariantDim>& strides)
    {
        const auto tupleLengths =
            generate_tuple([&](auto I) { return lengths[I]; }, Number<NumInvariantDim>{});
        const auto tupleStrides =
            generate_tuple([&](auto I) { return strides[I]; }, Number<NumInvariantDim>{});

        auto raw_grid_desc = make_naive_tensor_descriptor(tupleLengths, tupleStrides);

        auto grid_desc_m = transform_tensor_descriptor(
            raw_grid_desc,
            make_tuple(make_merge_transform(tupleLengths)),
            make_tuple(typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type{}),
            make_tuple(Sequence<0>{}));

        const auto invariantLength = grid_desc_m.GetLength(Number<0>{});

        const auto mPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto grid_desc_m_padded =
            transform_tensor_descriptor(grid_desc_m,
                                        make_tuple(make_right_pad_transform(invariantLength, mPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));
        return (grid_desc_m_padded);
    };

    using XYGridDesc_M_K      = decltype(MakeXY2dDescriptor({1}, {1}, 1, 1));
    using ScaleBiasGridDesc_M = decltype(MakeScaleBiasMeanVar1dDescriptor({1}, {1}));
    using MeanVarGridDesc_M   = ScaleBiasGridDesc_M;

    struct Argument : public BaseArgument
    {
        Argument(const std::array<index_t, Rank> xyLengths,
                 const std::array<index_t, Rank> xStrides,
                 const std::array<index_t, Rank> dyStrides,
                 const std::array<index_t, Rank> dxStrides,
                 const std::array<int, NumBatchNormReduceDim> reduceDims,
                 const std::array<ck::index_t, NumInvariantDim> bnScaleBiasMeanVarLengths,
                 const std::array<ck::index_t, NumInvariantDim> bnScaleStrides,
                 const std::array<ck::index_t, NumInvariantDim> bnBiasStrides,
                 const std::array<ck::index_t, NumInvariantDim> bnMeanVarStrides,
                 const XDataType* p_x,
                 const DyDataType* p_dy,
                 const ScaleDataType* p_scale,
                 const MeanVarDataType* p_savedMean,
                 const MeanVarDataType* p_savedInvVar,
                 const DyElementwiseOp dy_elementwise_op,
                 double epsilon,
                 DxDataType* p_dx,
                 ScaleDataType* p_dscale,
                 BiasDataType* p_dbias)
            : bnScaleBiasMeanVarLengths_(bnScaleBiasMeanVarLengths),
              bnScaleStrides_(bnScaleStrides),
              bnBiasStrides_(bnBiasStrides),
              bnMeanVarStrides_(bnMeanVarStrides),
              p_x_(p_x),
              p_dy_(p_dy),
              p_scale_(p_scale),
              p_savedMean_(p_savedMean),
              p_savedInvVar_(p_savedInvVar),
              dy_elementwise_op_(dy_elementwise_op),
              p_dx_(p_dx),
              p_dscale_(p_dscale),
              p_dbias_(p_dbias)
        {
            xyLengths_ =
                shuffle_tensor_dimensions<Rank, NumBatchNormReduceDim>(xyLengths, reduceDims);
            xStrides_ =
                shuffle_tensor_dimensions<Rank, NumBatchNormReduceDim>(xStrides, reduceDims);
            dyStrides_ =
                shuffle_tensor_dimensions<Rank, NumBatchNormReduceDim>(dyStrides, reduceDims);
            dxStrides_ =
                shuffle_tensor_dimensions<Rank, NumBatchNormReduceDim>(dxStrides, reduceDims);

            std::tie(invariant_length, reduce_length) =
                get_2d_lengths<Rank, NumBatchNormReduceDim>(xyLengths_);

            epsilon_ = type_convert<AccDataType>(epsilon);

            haveSavedMeanInvVar_ = (p_savedMean_ != nullptr && p_savedInvVar_ != nullptr);

            if(UseMultiblockInK)
            {
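                // worked example (comment added for clarity, matching the README run): with
                // -D 128,16,3,1024 we have reduce_length = 128*16*3 = 6144 and
                // K_BlockTileSize = 32, so iterations = 2 gives blkGroupSize = 96 <= 128;
                // with invariant_length = 1024 and M_BlockTileSize = 16 this yields
                // gridSize = 64 * 96 = 6144 blocks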
                int iterations = 1;
                while(true)
                {
                    int testBlkGroupSize = (reduce_length + (K_BlockTileSize * iterations) - 1) /
                                           (K_BlockTileSize * iterations);

                    // we want blkGroupSize to be no more than 128
                    if(testBlkGroupSize <= 128)
                        break;

                    iterations++;
                };

                blkGroupSize = (reduce_length + (K_BlockTileSize * iterations) - 1) /
                               (K_BlockTileSize * iterations);

                numBlockTileIteration = iterations;
            }
            else
            {
                blkGroupSize          = 1;
                numBlockTileIteration = (reduce_length + K_BlockTileSize - 1) / K_BlockTileSize;
            };

            gridSize = (invariant_length + M_BlockTileSize - 1) / M_BlockTileSize * blkGroupSize;

            x_grid_desc_m_k =
                MakeXY2dDescriptor(xyLengths_, xStrides_, blkGroupSize, numBlockTileIteration);
            dy_grid_desc_m_k =
                MakeXY2dDescriptor(xyLengths_, dyStrides_, blkGroupSize, numBlockTileIteration);
            dx_grid_desc_m_k =
                MakeXY2dDescriptor(xyLengths_, dxStrides_, blkGroupSize, numBlockTileIteration);
            scale_grid_desc_m =
                MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnScaleStrides);
            bias_grid_desc_m =
                MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnBiasStrides);
            mean_var_grid_desc_m =
                MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnMeanVarStrides);
        }

        AccDataType epsilon_;

        bool haveSavedMeanInvVar_;

        std::array<index_t, Rank> xyLengths_;
        std::array<index_t, Rank> xStrides_;
        std::array<index_t, Rank> dyStrides_;
        std::array<index_t, Rank> dxStrides_;

        std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths_;
        std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides_;
        std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides_;
        std::array<index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides_;

        const XDataType* p_x_;
        const DyDataType* p_dy_;
        const ScaleDataType* p_scale_;
        const MeanVarDataType* p_savedMean_;
        const MeanVarDataType* p_savedInvVar_;
        const DyElementwiseOp dy_elementwise_op_;
        DxDataType* p_dx_;
        ScaleDataType* p_dscale_;
        BiasDataType* p_dbias_;

        long_index_t invariant_length;
        long_index_t reduce_length;

        int blkGroupSize;
        int numBlockTileIteration;
        size_t gridSize;

        XYGridDesc_M_K x_grid_desc_m_k;
        XYGridDesc_M_K dy_grid_desc_m_k;
        XYGridDesc_M_K dx_grid_desc_m_k;
        ScaleBiasGridDesc_M scale_grid_desc_m;
        ScaleBiasGridDesc_M bias_grid_desc_m;
        MeanVarGridDesc_M mean_var_grid_desc_m;

        void* workspace_mean;
        void* workspace_variance;
        void* workspace_count;

        void* workspace_savedMean;
        void* workspace_savedInvVar;

        void* workspace_reduce_dscale;
        void* workspace_reduce_dbias;
    };

    size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
    {
        const Argument* pArg_ = dynamic_cast<const Argument*>(pArg);

        size_t workspace_size = 0;

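        // comment added for clarity: each sub-buffer is over-allocated by 64 bytes so that
        // SetWorkSpacePointer() can round every buffer start up to a 64-byte boundary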
        if(UseMultiblockInK && pArg_->blkGroupSize > 1)
        {
            // workspace for the partial reduced result for dscale
            workspace_size +=
                pArg_->invariant_length * pArg_->blkGroupSize * sizeof(ScaleDataType) + 64;

            // workspace for the partial reduced result for dbias
            workspace_size +=
                pArg_->invariant_length * pArg_->blkGroupSize * sizeof(BiasDataType) + 64;

            if(!pArg_->haveSavedMeanInvVar_)
            {
                // workspace for welford intermediate mean
                workspace_size +=
                    pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType) + 64;

                // workspace for welford intermediate variance
                workspace_size +=
                    pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType) + 64;

                // workspace for welford intermediate count
                workspace_size +=
                    pArg_->invariant_length * pArg_->blkGroupSize * sizeof(int32_t) + 64;

                // workspace for welford result mean
                workspace_size += pArg_->invariant_length * sizeof(MeanVarDataType) + 64;

                // workspace for welford result inv_variance
                workspace_size += pArg_->invariant_length * sizeof(MeanVarDataType) + 64;
            };
        }

        return (workspace_size);
    };

    void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override
    {
        Argument* pArg_ = dynamic_cast<Argument*>(pArg);

        pArg_->p_workspace_ = p_workspace;

        index_t space_sz;

        // setup buffer for the partial reduced result for dscale
        pArg_->workspace_reduce_dscale = pArg_->p_workspace_;

        space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(ScaleDataType);
        space_sz = math::integer_least_multiple(space_sz, 64);

        // setup buffer for the partial reduced result for dbias
        pArg_->workspace_reduce_dbias =
            reinterpret_cast<char*>(pArg_->workspace_reduce_dscale) + space_sz;

        if(UseMultiblockInK && pArg_->blkGroupSize > 1)
        {
            space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(BiasDataType);
            space_sz = math::integer_least_multiple(space_sz, 64);

            // setup buffer for welford intermediate mean
            pArg_->workspace_mean =
                reinterpret_cast<char*>(pArg_->workspace_reduce_dbias) + space_sz;

            space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType);
            space_sz = math::integer_least_multiple(space_sz, 64);

            // setup buffer for welford intermediate variance
            pArg_->workspace_variance = reinterpret_cast<char*>(pArg_->workspace_mean) + space_sz;

            space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType);
            space_sz = math::integer_least_multiple(space_sz, 64);

            // setup buffer for welford intermediate count
            pArg_->workspace_count = reinterpret_cast<char*>(pArg_->workspace_variance) + space_sz;

            space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(int32_t);
            space_sz = math::integer_least_multiple(space_sz, 64);

            // setup buffer for welford result mean
            pArg_->workspace_savedMean = reinterpret_cast<char*>(pArg_->workspace_count) + space_sz;

            space_sz = pArg_->invariant_length * sizeof(MeanVarDataType);
            space_sz = math::integer_least_multiple(space_sz, 64);

            // setup buffer for welford result inv_variance
            pArg_->workspace_savedInvVar =
                reinterpret_cast<char*>(pArg_->workspace_savedMean) + space_sz;
        };
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            float avg_time = 0;

            const auto mean_var_count_grid_desc_m_g =
                DeviceBatchNormBwdImpl::MakeMultiblockFirstReduceOutputMG2dDescriptor(
                    arg.invariant_length, arg.blkGroupSize);

            const auto dscale_dbias_grid_desc_m_g =
                DeviceBatchNormBwdImpl::MakeMultiblockFirstReduceOutputMG2dDescriptor(
                    arg.invariant_length, arg.blkGroupSize);

            const auto mean_var_count_grid_desc_m_k =
                DeviceBatchNormBwdImpl::MakeMultiblockFinalReduceInputMK2dDescriptor(
                    arg.invariant_length, arg.blkGroupSize);

            const auto dscale_dbias_grid_desc_m_k =
                DeviceBatchNormBwdImpl::MakeMultiblockFinalReduceInputMK2dDescriptor(
                    arg.invariant_length, arg.blkGroupSize);

            using MeanVarCountGridDesc_M_G = decltype(mean_var_count_grid_desc_m_g);
            using MeanVarCountGridDesc_M_K = decltype(mean_var_count_grid_desc_m_k);
            using DscaleDbiasGridDesc_M_G  = decltype(dscale_dbias_grid_desc_m_g);
            using DscaleDbiasGridDesc_M_K  = decltype(dscale_dbias_grid_desc_m_k);

            using GridwiseWelfordSecondHalfReduceFirstHalf_ =
                GridwiseWelfordSecondHalfReduceFirstHalf<
                    XDataType, DyDataType, AccDataType, ScaleDataType, BiasDataType,
                    MeanVarDataType, DyElementwiseOp, XYGridDesc_M_K, MeanVarGridDesc_M,
                    MeanVarCountGridDesc_M_K, DscaleDbiasGridDesc_M_G, BlockSize,
                    MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize,
                    XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize, MeanVarSrcVectorSize>;

            using GridwiseReduceSecondHalfBatchNormBwdFinal_ =
                GridwiseReduceSecondHalfBatchNormBackwardFinal<
                    XDataType, DyDataType, DxDataType, AccDataType, ScaleDataType, BiasDataType,
                    MeanVarDataType, DyElementwiseOp, XYGridDesc_M_K, DscaleDbiasGridDesc_M_K,
                    MeanVarGridDesc_M, ScaleBiasGridDesc_M, BlockSize, MThreadClusterSize,
                    KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XDyDxVectorDim,
                    XSrcVectorSize, DySrcVectorSize, DxDstVectorSize, ScaleSrcDstVectorSize,
                    BiasDstVectorSize, MeanVarSrcVectorSize>;

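            // comment added for clarity: the multi-block path runs up to three kernels:
            // (1) optional multiblock Welford first half producing per-block-group partial
            //     mean/variance/count, (2) Welford second half fused with the first half of
            //     the dscale/dbias reduction, (3) final reduction fused with the batchnorm
            //     backward dx computation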
            if(UseMultiblockInK && arg.blkGroupSize > 1)
            {
                using GetReduceCountPerThreadFunctor =
                    GetReduceCountPerThreadForMultiblockWelford<K_BlockTileSize, KThreadSliceSize>;

                GetReduceCountPerThreadFunctor get_reduce_count_per_thread(
                    arg.blkGroupSize, arg.numBlockTileIteration, arg.reduce_length);

                if(!arg.haveSavedMeanInvVar_)
                {
                    using GridwiseMultiblockWelfordFirstHalf_ = GridwiseMultiblockWelfordFirstHalf<
                        XDataType, AccDataType, MeanVarDataType, XYGridDesc_M_K,
                        MeanVarCountGridDesc_M_G, GetReduceCountPerThreadFunctor, BlockSize,
                        MThreadClusterSize, KThreadClusterSize, MThreadSliceSize,
                        KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize>;

                    const auto kern_multiblock_welford_first_half =
                        kernel_multiblock_welford_first_half<GridwiseMultiblockWelfordFirstHalf_,
                                                             XDataType,
                                                             MeanVarDataType,
                                                             XYGridDesc_M_K,
                                                             MeanVarCountGridDesc_M_G,
                                                             GetReduceCountPerThreadFunctor>;

                    avg_time += launch_and_time_kernel(
                        stream_config,
                        kern_multiblock_welford_first_half,
                        dim3(arg.gridSize),
                        dim3(BlockSize),
                        0,
                        arg.x_grid_desc_m_k,
                        mean_var_count_grid_desc_m_g,
                        get_reduce_count_per_thread,
                        arg.numBlockTileIteration,
                        arg.p_x_,
                        static_cast<MeanVarDataType*>(arg.workspace_mean),
                        static_cast<MeanVarDataType*>(arg.workspace_variance),
                        static_cast<int32_t*>(arg.workspace_count));
                };

                const auto kern_welford_second_half_reduce_first_half =
                    kernel_welford_second_half_reduce_first_half<
                        GridwiseWelfordSecondHalfReduceFirstHalf_,
                        XDataType,
                        DyDataType,
                        AccDataType,
                        ScaleDataType,
                        BiasDataType,
                        MeanVarDataType,
                        DyElementwiseOp,
                        XYGridDesc_M_K,
                        MeanVarGridDesc_M,
                        MeanVarCountGridDesc_M_K,
                        DscaleDbiasGridDesc_M_G>;

                const auto kern_reduce_second_half_batchnorm_backward_final =
                    kernel_reduce_second_half_batchnorm_backward_final<
                        GridwiseReduceSecondHalfBatchNormBwdFinal_,
                        XDataType,
                        DyDataType,
                        DxDataType,
                        ScaleDataType,
                        BiasDataType,
                        MeanVarDataType,
                        DyElementwiseOp,
                        XYGridDesc_M_K,
                        DscaleDbiasGridDesc_M_K,
                        MeanVarGridDesc_M,
                        ScaleBiasGridDesc_M>;

                index_t numDscaleDbiasBlockTileIteration =
                    (arg.blkGroupSize + KThreadClusterSize - 1) / KThreadClusterSize;

                avg_time += launch_and_time_kernel(
                    stream_config,
                    kern_welford_second_half_reduce_first_half,
                    dim3(arg.gridSize),
                    dim3(BlockSize),
                    0,
                    arg.x_grid_desc_m_k,
                    arg.dy_grid_desc_m_k,
                    arg.mean_var_grid_desc_m,
                    mean_var_count_grid_desc_m_k,
                    dscale_dbias_grid_desc_m_g,
                    arg.blkGroupSize,
                    arg.numBlockTileIteration,
                    numDscaleDbiasBlockTileIteration,
                    arg.epsilon_,
                    arg.haveSavedMeanInvVar_,
                    arg.haveSavedMeanInvVar_ ? arg.p_savedMean_ : nullptr,
                    arg.haveSavedMeanInvVar_ ? arg.p_savedInvVar_ : nullptr,
                    arg.haveSavedMeanInvVar_
                        ? nullptr
                        : static_cast<const MeanVarDataType*>(arg.workspace_mean),
                    arg.haveSavedMeanInvVar_
                        ? nullptr
                        : static_cast<const MeanVarDataType*>(arg.workspace_variance),
                    arg.haveSavedMeanInvVar_ ? nullptr
                                             : static_cast<const int32_t*>(arg.workspace_count),
                    arg.dy_elementwise_op_,
                    arg.haveSavedMeanInvVar_
                        ? nullptr
                        : static_cast<MeanVarDataType*>(arg.workspace_savedMean),
                    arg.haveSavedMeanInvVar_
                        ? nullptr
                        : static_cast<MeanVarDataType*>(arg.workspace_savedInvVar),
                    arg.p_x_,
                    arg.p_dy_,
                    static_cast<ScaleDataType*>(arg.workspace_reduce_dscale),
                    static_cast<BiasDataType*>(arg.workspace_reduce_dbias));

                avg_time += launch_and_time_kernel(
                    stream_config,
                    kern_reduce_second_half_batchnorm_backward_final,
                    dim3(arg.gridSize),
                    dim3(BlockSize),
                    0,
                    arg.x_grid_desc_m_k,
                    arg.dy_grid_desc_m_k,
                    arg.dx_grid_desc_m_k,
                    dscale_dbias_grid_desc_m_k,
                    arg.mean_var_grid_desc_m,
                    arg.scale_grid_desc_m,
                    arg.bias_grid_desc_m,
                    arg.blkGroupSize,
                    arg.reduce_length,
                    arg.numBlockTileIteration,
                    numDscaleDbiasBlockTileIteration,
                    static_cast<const ScaleDataType*>(arg.workspace_reduce_dscale),
                    static_cast<const BiasDataType*>(arg.workspace_reduce_dbias),
                    arg.haveSavedMeanInvVar_
                        ? arg.p_savedMean_
                        : static_cast<const MeanVarDataType*>(arg.workspace_savedMean),
                    arg.haveSavedMeanInvVar_
                        ? arg.p_savedInvVar_
                        : static_cast<const MeanVarDataType*>(arg.workspace_savedInvVar),
                    arg.p_x_,
                    arg.p_dy_,
                    arg.p_scale_,
                    arg.dy_elementwise_op_,
                    arg.p_dx_,
                    arg.p_dscale_,
                    arg.p_dbias_);
            }
            else
            {
                using GetReduceCountPerThreadFunctor =
                    GetReduceCountPerThreadForBlockwiseWelford<K_BlockTileSize, KThreadSliceSize>;

                GetReduceCountPerThreadFunctor get_reduce_count_per_thread(
                    arg.numBlockTileIteration, arg.reduce_length);

                using GridwiseBatchNormBackwardWithBlockwiseWelford_ =
                    GridwiseBatchNormBackwardWithBlockwiseWelford<
                        XDataType, DyDataType, DxDataType, AccDataType, ScaleDataType,
                        BiasDataType, MeanVarDataType, DyElementwiseOp, XYGridDesc_M_K,
                        ScaleBiasGridDesc_M, MeanVarGridDesc_M, GetReduceCountPerThreadFunctor,
                        BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize,
                        KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize,
                        DxDstVectorSize, ScaleSrcDstVectorSize, BiasDstVectorSize,
                        MeanVarSrcVectorSize>;

                const auto kern_batchnorm_bwd = kernel_batchnorm_backward_with_blockwise_welford<
                    GridwiseBatchNormBackwardWithBlockwiseWelford_,
                    XDataType,
                    DyDataType,
                    DxDataType,
                    AccDataType,
                    ScaleDataType,
                    BiasDataType,
                    MeanVarDataType,
                    DyElementwiseOp,
                    XYGridDesc_M_K,
                    ScaleBiasGridDesc_M,
                    MeanVarGridDesc_M,
                    GetReduceCountPerThreadFunctor>;

                avg_time += launch_and_time_kernel(stream_config,
                                                   kern_batchnorm_bwd,
                                                   dim3(arg.gridSize),
                                                   dim3(BlockSize),
                                                   0,
                                                   arg.x_grid_desc_m_k,
                                                   arg.dy_grid_desc_m_k,
                                                   arg.dx_grid_desc_m_k,
                                                   arg.scale_grid_desc_m,
                                                   arg.bias_grid_desc_m,
                                                   arg.mean_var_grid_desc_m,
                                                   get_reduce_count_per_thread,
                                                   arg.reduce_length,
                                                   arg.numBlockTileIteration,
                                                   arg.epsilon_,
                                                   arg.p_x_,
                                                   arg.p_dy_,
                                                   arg.p_scale_,
                                                   arg.haveSavedMeanInvVar_,
                                                   arg.p_savedMean_,
                                                   arg.p_savedInvVar_,
                                                   arg.dy_elementwise_op_,
                                                   arg.p_dx_,
                                                   arg.p_dscale_,
                                                   arg.p_dbias_);
            };

            return (avg_time);
        };

        float Run(const BaseArgument* pArg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(pArg), stream_config);
        };
    };

    bool IsSupportedArgument(const BaseArgument* pArg) override
    {
        const Argument* pArg_ = dynamic_cast<const Argument*>(pArg);

        if constexpr(XDyDxVectorDim == 0)
        {
            if(pArg_->xStrides_[NumInvariantDim - 1] != 1 ||
               pArg_->dyStrides_[NumInvariantDim - 1] != 1 ||
               pArg_->dxStrides_[NumInvariantDim - 1] != 1)
                return false;

            if(pArg_->xyLengths_[NumInvariantDim - 1] % XSrcVectorSize != 0 ||
               pArg_->xyLengths_[NumInvariantDim - 1] % DySrcVectorSize != 0 ||
               pArg_->xyLengths_[NumInvariantDim - 1] % DxDstVectorSize != 0)
                return false;
        }
        else
        {
            if(pArg_->xStrides_[Rank - 1] != 1 || pArg_->dyStrides_[Rank - 1] != 1 ||
               pArg_->dxStrides_[Rank - 1] != 1)
                return false;

            if(pArg_->xyLengths_[Rank - 1] % XSrcVectorSize != 0 ||
               pArg_->xyLengths_[Rank - 1] % DySrcVectorSize != 0 ||
               pArg_->xyLengths_[Rank - 1] % DxDstVectorSize != 0)
                return false;
        };

        if(pArg_->bnScaleStrides_[NumInvariantDim - 1] != 1 && ScaleSrcDstVectorSize != 1)
            return false;

        if(pArg_->bnBiasStrides_[NumInvariantDim - 1] != 1 && BiasDstVectorSize != 1)
            return false;

        if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % ScaleSrcDstVectorSize != 0)
            return false;

        if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % BiasDstVectorSize != 0)
            return false;

        if(pArg_->haveSavedMeanInvVar_)
        {
            if(pArg_->bnMeanVarStrides_[NumInvariantDim - 1] != 1 && MeanVarSrcVectorSize != 1)
                return false;

            if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % MeanVarSrcVectorSize != 0)
                return false;
        };

        bool is_valid = true;

        static_for<0, NumInvariantDim, 1>{}([&](auto I) {
            if(pArg_->xyLengths_[I] != pArg_->bnScaleBiasMeanVarLengths_[I])
                is_valid = false;
        });

        if(!is_valid)
            return false;

        return true;
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::array<index_t, Rank> xyLengths,
                        const std::array<index_t, Rank> xStrides,
                        const std::array<index_t, Rank> dyStrides,
                        const std::array<index_t, Rank> dxStrides,
                        const std::array<int, NumBatchNormReduceDim> reduceDims,
                        const std::array<ck::index_t, NumInvariantDim> bnScaleBiasMeanVarLengths,
                        const std::array<ck::index_t, NumInvariantDim> bnScaleStrides,
                        const std::array<ck::index_t, NumInvariantDim> bnBiasStrides,
                        const std::array<ck::index_t, NumInvariantDim> bnMeanVarStrides,
                        const void* p_x,
                        const void* p_dy,
                        const void* p_scale,
                        const void* p_savedMean,
                        const void* p_savedInvVar,
                        double epsilon,
                        const DyElementwiseOp dy_elementwise_op,
                        void* p_dx,
                        void* p_dscale,
                        void* p_dbias) override
    {
        return std::make_unique<Argument>(xyLengths,
                                          xStrides,
                                          dyStrides,
                                          dxStrides,
                                          reduceDims,
                                          bnScaleBiasMeanVarLengths,
                                          bnScaleStrides,
                                          bnBiasStrides,
                                          bnMeanVarStrides,
                                          static_cast<const XDataType*>(p_x),
                                          static_cast<const DyDataType*>(p_dy),
                                          static_cast<const ScaleDataType*>(p_scale),
                                          static_cast<const MeanVarDataType*>(p_savedMean),
                                          static_cast<const MeanVarDataType*>(p_savedInvVar),
                                          dy_elementwise_op,
                                          epsilon,
                                          static_cast<DxDataType*>(p_dx),
                                          static_cast<ScaleDataType*>(p_dscale),
                                          static_cast<BiasDataType*>(p_dbias));
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceBatchNormBwdImpl<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "XDyDxVectorDim_" << XDyDxVectorDim << ",";
        str << "VectorSize_X" << XSrcVectorSize << "_scale_" << ScaleSrcDstVectorSize << "_bias_" << BiasDstVectorSize << "_mean_var_" << MeanVarSrcVectorSize << "_Dx_" << DxDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -0,0 +1,534 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename GridwiseReduceSecondHalfBatchNormBackwardFinal_,
          typename XDataType,
          typename DyDataType,
          typename DxDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename DscaleDbiasGridDesc_M_K,
          typename MeanVarGridDesc_M,
          typename ScaleBiasGridDesc_M>
__global__ void kernel_reduce_second_half_batchnorm_backward_final(
    const XYGridDesc_M_K x_grid_desc_m_k,
    const XYGridDesc_M_K dy_grid_desc_m_k,
    const XYGridDesc_M_K dx_grid_desc_m_k,
    const DscaleDbiasGridDesc_M_K dscale_dbias_grid_desc_m_k,
    const MeanVarGridDesc_M mean_var_grid_desc_m,
    const ScaleBiasGridDesc_M scale_grid_desc_m,
    const ScaleBiasGridDesc_M bias_grid_desc_m,
    index_t blkgroup_size,
    long_index_t reduce_size,
    index_t num_xy_k_block_tile_iteration,
    index_t num_dscale_dbias_k_block_tile_iteration,
    const ScaleDataType* const __restrict__ p_reduce_dscale,
    const BiasDataType* const __restrict__ p_reduce_dbias,
    const MeanVarDataType* const __restrict__ p_mean,
    const MeanVarDataType* const __restrict__ p_inv_var,
    const XDataType* const __restrict__ p_x,
    const DyDataType* const __restrict__ p_dy,
    const ScaleDataType* const __restrict__ p_scale,
    const DyElementwiseOp dy_elementwise_op,
    DxDataType* const __restrict__ p_dx,
    ScaleDataType* const __restrict__ p_dscale,
    BiasDataType* const __restrict__ p_dbias)
{
    GridwiseReduceSecondHalfBatchNormBackwardFinal_::Run(x_grid_desc_m_k,
                                                         dy_grid_desc_m_k,
                                                         dx_grid_desc_m_k,
                                                         dscale_dbias_grid_desc_m_k,
                                                         mean_var_grid_desc_m,
                                                         scale_grid_desc_m,
                                                         bias_grid_desc_m,
                                                         blkgroup_size,
                                                         reduce_size,
                                                         num_xy_k_block_tile_iteration,
                                                         num_dscale_dbias_k_block_tile_iteration,
                                                         p_reduce_dscale,
                                                         p_reduce_dbias,
                                                         p_mean,
                                                         p_inv_var,
                                                         p_x,
                                                         p_dy,
                                                         p_scale,
                                                         dy_elementwise_op,
                                                         p_dx,
                                                         p_dscale,
                                                         p_dbias);
};

template <typename XDataType,
          typename DyDataType,
          typename DxDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename DscaleDbiasGridDesc_M_K,
          typename MeanVarGridDesc_M,
          typename ScaleBiasGridDesc_M,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XDyDxVectorDim,
          index_t XSrcVectorSize,
          index_t DySrcVectorSize,
          index_t DxDstVectorSize,
          index_t ScaleSrcDstVectorSize,
          index_t BiasDstVectorSize,
          index_t MeanVarSrcVectorSize>
struct GridwiseReduceSecondHalfBatchNormBackwardFinal
{
    static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 &&
                   MThreadSliceSize % DySrcVectorSize == 0 &&
                   MThreadSliceSize % DxDstVectorSize == 0) ||
                      (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 &&
                       KThreadSliceSize % DySrcVectorSize == 0 &&
                       KThreadSliceSize % DxDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr bool reorder_thread_cluster = (XDyDxVectorDim == 0);
|
||||
using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
|
||||
|
||||
using ThreadBufferDimAccessOrder =
|
||||
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
|
||||
|
||||
using ThreadClusterArrangeOrder =
|
||||
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
|
||||
|
||||
static constexpr auto thread_cluster_desc =
|
||||
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
|
||||
|
||||
using ThreadReduceSrcDesc_M_1 = decltype(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, Number<1>{})));
|
||||
using ThreadReduceDstDesc_M =
|
||||
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
|
||||
|
||||
using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
|
||||
BlockSize,
|
||||
ThreadClusterLengths_M_K,
|
||||
ThreadClusterArrangeOrder,
|
||||
ck::reduce::Add,
|
||||
false>;
|
||||
|
||||
using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
|
||||
ThreadReduceSrcDesc_M_1,
|
||||
ThreadReduceDstDesc_M,
|
||||
ck::reduce::Add,
|
||||
false>;
|
||||
|
||||
using PassThroughOp = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
|
||||
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
|
||||
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
|
||||
|
||||
// clang-format off
|
||||
// Two of the steps of Multiblock BatchNorm Backward
|
||||
// Step 1: Second half of Reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance)
|
||||
// Step 2: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance)) elementwise-ly
|
||||
// clang-format on
|
||||
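    // Editor's note (sketch, assuming the standard batchnorm backward derivation):
    // differentiating y = scale * (x - mean) * inv-variance + bias over a reduction of
    // size N, with mean and variance themselves functions of x and norm_x denoting
    // (x - mean) * inv-variance, gives
    //
    //   dx = (scale * inv-variance / N) * (N * dy - sum(dy) - norm_x * sum(dy * norm_x))
    //      = (scale * inv-variance / N) * (N * dy - dbias - dscale * norm_x)
    //
    // which is exactly the Step-2 expression evaluated below from the Step-1 reductions.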
    __device__ static void Run(const XYGridDesc_M_K& x_grid_desc_m_k,
                               const XYGridDesc_M_K& dy_grid_desc_m_k,
                               const XYGridDesc_M_K& dx_grid_desc_m_k,
                               const DscaleDbiasGridDesc_M_K& dscale_dbias_grid_desc_m_k,
                               const MeanVarGridDesc_M& mean_var_grid_desc_m,
                               const ScaleBiasGridDesc_M& scale_grid_desc_m,
                               const ScaleBiasGridDesc_M& bias_grid_desc_m,
                               index_t blkgroup_size,
                               long_index_t reduce_size,
                               index_t num_xy_k_block_tile_iteration,
                               index_t num_dscale_dbias_k_block_tile_iteration,
                               const ScaleDataType* const __restrict__ p_reduce_dscale,
                               const BiasDataType* const __restrict__ p_reduce_dbias,
                               const MeanVarDataType* const __restrict__ p_mean,
                               const MeanVarDataType* const __restrict__ p_inv_var,
                               const XDataType* const __restrict__ p_x,
                               const DyDataType* const __restrict__ p_dy,
                               const ScaleDataType* const __restrict__ p_scale,
                               const DyElementwiseOp dy_elementwise_op,
                               DxDataType* const __restrict__ p_dx,
                               ScaleDataType* const __restrict__ p_dscale,
                               BiasDataType* const __restrict__ p_dbias)
    {
        __shared__ AccDataType p_reduce_work_buffer[BlockSize];

        auto reduce_work_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * 1, true>
            reduce_dscale_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * 1, true>
            reduce_dbias_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> dscale_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> dbias_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            x_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            dy_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            dx_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            inv_var_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> scale_thread_buf;

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / blkgroup_size;
        const index_t block_local_id  = block_global_id % blkgroup_size;

        const auto thread_cluster_idx =
            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));

        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, KThreadSliceSize>;
        using ThreadBufferLengths_M   = Sequence<MThreadSliceSize>;
        using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
        constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
        constexpr auto thread_buffer_desc_m =
            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
        constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<1>{}));

        // clang-format off
        // Step 1: do final reduction of dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance)
        // clang-format on

        auto threadwise_dscale_load_m_k =
            ThreadwiseTensorSliceTransfer_v2<ScaleDataType,
                                             AccDataType,
                                             DscaleDbiasGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_1),
                                             ThreadBufferLengths_M_1,
                                             Sequence<0, 1>,
                                             1,
                                             1,
                                             1,
                                             true>(
                dscale_dbias_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * 1));

        auto threadwise_dbias_load_m_k =
            ThreadwiseTensorSliceTransfer_v2<BiasDataType,
                                             AccDataType,
                                             DscaleDbiasGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_1),
                                             ThreadBufferLengths_M_1,
                                             Sequence<0, 1>,
                                             1,
                                             1,
                                             1,
                                             true>(
                dscale_dbias_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * 1));

        auto threadwise_dscale_store_m =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               ScaleDataType,
                                               decltype(thread_buffer_desc_m),
                                               ScaleBiasGridDesc_M,
                                               PassThroughOp,
                                               ThreadBufferLengths_M,
                                               Sequence<0>,
                                               0,
                                               ScaleSrcDstVectorSize,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                scale_grid_desc_m,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp{});

        auto threadwise_dbias_store_m =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               BiasDataType,
                                               decltype(thread_buffer_desc_m),
                                               ScaleBiasGridDesc_M,
                                               PassThroughOp,
                                               ThreadBufferLengths_M,
                                               Sequence<0>,
                                               0,
                                               BiasDstVectorSize,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                bias_grid_desc_m,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp{});

        const auto reduce_dscale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_reduce_dscale, dscale_dbias_grid_desc_m_k.GetElementSpaceSize());

        const auto reduce_dbias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_reduce_dbias, dscale_dbias_grid_desc_m_k.GetElementSpaceSize());

        auto dscale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_dscale, scale_grid_desc_m.GetElementSpaceSize());

        auto dbias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_dbias, bias_grid_desc_m.GetElementSpaceSize());

        constexpr auto dscale_dbias_thread_copy_step_m_k =
            make_multi_index(0, KThreadClusterSize * 1);

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            dscale_thread_buf(I) = type_convert<AccDataType>(0.0f);
            dbias_thread_buf(I)  = type_convert<AccDataType>(0.0f);
        });

        for(index_t reducedTiles = 0; reducedTiles < num_dscale_dbias_k_block_tile_iteration;
            ++reducedTiles)
        {
            threadwise_dscale_load_m_k.Run(dscale_dbias_grid_desc_m_k,
                                           reduce_dscale_global_buf,
                                           thread_buffer_desc_m_1,
                                           make_tuple(I0, I0),
                                           reduce_dscale_thread_buf);

            threadwise_dbias_load_m_k.Run(dscale_dbias_grid_desc_m_k,
                                          reduce_dbias_global_buf,
                                          thread_buffer_desc_m_1,
                                          make_tuple(I0, I0),
                                          reduce_dbias_thread_buf);

            ThreadwiseReduce::Reduce(reduce_dscale_thread_buf, dscale_thread_buf);
            ThreadwiseReduce::Reduce(reduce_dbias_thread_buf, dbias_thread_buf);

            threadwise_dscale_load_m_k.MoveSrcSliceWindow(dscale_dbias_grid_desc_m_k,
                                                          dscale_dbias_thread_copy_step_m_k);
            threadwise_dbias_load_m_k.MoveSrcSliceWindow(dscale_dbias_grid_desc_m_k,
                                                         dscale_dbias_thread_copy_step_m_k);
        }

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(I > 0)
                block_sync_lds();

            BlockwiseReduce::Reduce(reduce_work_buf, dscale_thread_buf(I));
            block_sync_lds();
            BlockwiseReduce::Reduce(reduce_work_buf, dbias_thread_buf(I));
        });
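        // Editor's note: the barriers around the two BlockwiseReduce calls are needed
        // because both reductions share the same LDS workspace (reduce_work_buf); without
        // block_sync_lds() a fast thread could overwrite partial sums of the first
        // reduction before a slower thread has finished reading them.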

        threadwise_dscale_store_m.Run(thread_buffer_desc_m,
                                      make_tuple(I0),
                                      dscale_thread_buf,
                                      scale_grid_desc_m,
                                      dscale_global_buf);

        threadwise_dbias_store_m.Run(thread_buffer_desc_m,
                                     make_tuple(I0),
                                     dbias_thread_buf,
                                     bias_grid_desc_m,
                                     dbias_global_buf);

        // clang-format off
        // Step 2: calculate dx = 1/N * inv-variance * scale * (N * dy - dbias - dscale * (x - mean) * inv-variance)
        // clang-format on
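        // Editor's sketch of the per-element Step-2 computation done below:
        //   norm_x     = (x - mean) * inv_var;
        //   multiplier = inv_var * scale / reduce_size;   // per-M factor, hoisted
        //   dx         = multiplier * (reduce_size * dy - dbias - dscale * norm_x);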

        const index_t workSizePerBlock = K_BlockTileSize * num_xy_k_block_tile_iteration;

        auto threadwise_x_load =
            ThreadwiseTensorSliceTransfer_v2<XDataType,
                                             AccDataType,
                                             XYGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
                                             ThreadBufferDimAccessOrder,
                                             XDyDxVectorDim,
                                             XSrcVectorSize,
                                             1,
                                             true>(
                x_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 workSizePerBlock * block_local_id +
                                     thread_k_cluster_id * KThreadSliceSize));

        auto threadwise_dy_load =
            ThreadwiseTensorSliceTransfer_v2<DyDataType,
                                             AccDataType,
                                             XYGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
                                             ThreadBufferDimAccessOrder,
                                             XDyDxVectorDim,
                                             DySrcVectorSize,
                                             1,
                                             true>(
                dy_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 workSizePerBlock * block_local_id +
                                     thread_k_cluster_id * KThreadSliceSize));

        auto threadwise_dx_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               DxDataType,
                                               decltype(thread_buffer_desc_m_k),
                                               XYGridDesc_M_K,
                                               PassThroughOp,
                                               ThreadBufferLengths_M_K,
                                               ThreadBufferDimAccessOrder,
                                               XDyDxVectorDim,
                                               DxDstVectorSize,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                dx_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 workSizePerBlock * block_local_id +
                                     thread_k_cluster_id * KThreadSliceSize),
                PassThroughOp{});

        auto threadwise_scale_load =
            ThreadwiseTensorSliceTransfer_v2<ScaleDataType,
                                             AccDataType,
                                             ScaleBiasGridDesc_M,
                                             decltype(thread_buffer_desc_m),
                                             ThreadBufferLengths_M,
                                             Sequence<0>,
                                             0,
                                             ScaleSrcDstVectorSize,
                                             1,
                                             true>(
                scale_grid_desc_m,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize));

        auto threadwise_mean_var_load =
            ThreadwiseTensorSliceTransfer_v2<MeanVarDataType,
                                             AccDataType,
                                             MeanVarGridDesc_M,
                                             decltype(thread_buffer_desc_m),
                                             ThreadBufferLengths_M,
                                             Sequence<0>,
                                             0,
                                             MeanVarSrcVectorSize,
                                             1,
                                             true>(
                mean_var_grid_desc_m,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize));

        const auto x_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_x, x_grid_desc_m_k.GetElementSpaceSize());

        const auto dy_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_dy, dy_grid_desc_m_k.GetElementSpaceSize());

        auto dx_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_dx, dx_grid_desc_m_k.GetElementSpaceSize());

        const auto scale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_scale, scale_grid_desc_m.GetElementSpaceSize());

        const auto mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_mean, mean_var_grid_desc_m.GetElementSpaceSize());

        const auto inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_inv_var, mean_var_grid_desc_m.GetElementSpaceSize());

        threadwise_scale_load.Run(scale_grid_desc_m,
                                  scale_global_buf,
                                  thread_buffer_desc_m,
                                  make_tuple(I0),
                                  scale_thread_buf);

        threadwise_mean_var_load.Run(mean_var_grid_desc_m,
                                     mean_global_buf,
                                     thread_buffer_desc_m,
                                     make_tuple(I0),
                                     mean_thread_buf);

        threadwise_mean_var_load.Run(mean_var_grid_desc_m,
                                     inv_var_global_buf,
                                     thread_buffer_desc_m,
                                     make_tuple(I0),
                                     inv_var_thread_buf);

        constexpr auto xy_thread_copy_step_m_k = make_multi_index(0, K_BlockTileSize);

        AccDataType inv_reduce_size =
            type_convert<AccDataType>(1.0) / type_convert<AccDataType>(reduce_size);
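        // Editor's note: 1/reduce_size is computed once here and folded into the per-M
        // "multiplier" below, so the inner static_for loops do a multiply per element
        // instead of a divide.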

        for(index_t reducedTiles = 0; reducedTiles < num_xy_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
                                  x_global_buf,
                                  thread_buffer_desc_m_k,
                                  make_tuple(I0, I0),
                                  x_thread_buf);

            threadwise_dy_load.Run(dy_grid_desc_m_k,
                                   dy_global_buf,
                                   thread_buffer_desc_m_k,
                                   make_tuple(I0, I0),
                                   dy_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                AccDataType multiplier =
                    inv_reduce_size * inv_var_thread_buf[iM] * scale_thread_buf[iM];

                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                    dy_elementwise_op(dy_thread_buf(Number<offset>{}),
                                      dy_thread_buf[Number<offset>{}]);

                    AccDataType norm_x = (x_thread_buf[Number<offset>{}] - mean_thread_buf[iM]) *
                                         inv_var_thread_buf[iM];

                    AccDataType tmpVal = norm_x * dscale_thread_buf[iM];

                    dx_thread_buf(Number<offset>{}) =
                        multiplier *
                        (type_convert<AccDataType>(reduce_size) * dy_thread_buf[Number<offset>{}] -
                         dbias_thread_buf[iM] - tmpVal);
                });
            });

            threadwise_dx_store.Run(thread_buffer_desc_m_k,
                                    make_tuple(I0, I0),
                                    dx_thread_buf,
                                    dx_grid_desc_m_k,
                                    dx_global_buf);

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, xy_thread_copy_step_m_k);
            threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, xy_thread_copy_step_m_k);
            threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, xy_thread_copy_step_m_k);
        }
    };
};

} // namespace ck
@@ -93,6 +93,9 @@ struct GridwiseMultiblockWelfordFirstHalf
    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    // clang-format off
    // First half of the Multiblock Welford method to calculate mean and variance, used by both batchnorm-forward and batchnorm-backward.
    // clang-format on
    __device__ static void Run(const XGridDesc_M_K& x_grid_desc_m_k,
                               const MeanVarCountGridDesc_M_G& mean_var_count_grid_desc_m_g,
                               const GetReduceCountPerThreadFunctor& get_reduce_count_per_thread,

@@ -529,6 +529,7 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
        auto result_inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            resultSaveInvVariance, mean_var_grid_desc_m.GetElementSpaceSize());

        // calculate inv-variance as 1/sqrt(epsilon+variance)
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            welford_var_thread_buf(I) =
                type_convert<AccDataType>(1.0f) / sqrt(epsilon + welford_var_thread_buf[I]);

@@ -0,0 +1,575 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename GridwiseWelfordSecondHalfReduceFirstHalf_,
          typename XDataType,
          typename DyDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename MeanVarGridDesc_M,
          typename MeanVarCountGridDesc_M_K,
          typename DscaleDbiasGridDesc_M_G>
__global__ void kernel_welford_second_half_reduce_first_half(
    const XYGridDesc_M_K x_grid_desc_m_k,
    const XYGridDesc_M_K dy_grid_desc_m_k,
    const MeanVarGridDesc_M mean_var_grid_desc_m,
    const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k,
    const DscaleDbiasGridDesc_M_G dscale_dbias_grid_desc_m_g,
    index_t blkgroup_size,
    index_t num_xy_k_block_tile_iteration,
    index_t num_mean_var_count_k_block_tile_iteration,
    AccDataType epsilon,
    bool haveSavedMeanInvVar,
    const MeanVarDataType* const __restrict__ p_savedMean,
    const MeanVarDataType* const __restrict__ p_savedInvVar,
    const MeanVarDataType* const __restrict__ p_in_welford_mean,
    const MeanVarDataType* const __restrict__ p_in_welford_variance,
    const int32_t* const __restrict__ p_in_welford_count,
    const DyElementwiseOp dy_elementwise_op,
    MeanVarDataType* const __restrict__ p_out_welford_mean,
    MeanVarDataType* const __restrict__ p_out_welford_inv_variance,
    const XDataType* const __restrict__ p_x,
    const DyDataType* const __restrict__ p_dy,
    ScaleDataType* const __restrict__ p_reduce_dscale,
    BiasDataType* const __restrict__ p_reduce_dbias)
{
    GridwiseWelfordSecondHalfReduceFirstHalf_::Run(
        x_grid_desc_m_k,
        dy_grid_desc_m_k,
        mean_var_grid_desc_m,
        mean_var_count_grid_desc_m_k,
        dscale_dbias_grid_desc_m_g,
        blkgroup_size,
        num_xy_k_block_tile_iteration,
        num_mean_var_count_k_block_tile_iteration,
        epsilon,
        haveSavedMeanInvVar,
        p_savedMean,
        p_savedInvVar,
        p_in_welford_mean,
        p_in_welford_variance,
        p_in_welford_count,
        dy_elementwise_op,
        p_out_welford_mean,
        p_out_welford_inv_variance,
        p_x,
        p_dy,
        p_reduce_dscale,
        p_reduce_dbias);
};

template <typename XDataType,
          typename DyDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename MeanVarGridDesc_M,
          typename MeanVarCountGridDesc_M_K,
          typename DscaleDbiasGridDesc_M_G,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XDyVectorDim,
          index_t XSrcVectorSize,
          index_t DySrcVectorSize,
          index_t MeanVarSrcVectorSize>
struct GridwiseWelfordSecondHalfReduceFirstHalf
{
    static_assert((XDyVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 &&
                   MThreadSliceSize % DySrcVectorSize == 0) ||
                      (XDyVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 &&
                       KThreadSliceSize % DySrcVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr bool reorder_thread_cluster = (XDyVectorDim == 0);

    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;

    using ThreadBufferDimAccessOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    using ThreadClusterArrangeOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
    using ThreadReduceSrcDesc_M_1 = decltype(
        make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, Number<1>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using ThreadwiseWelford =
        ThreadwiseWelfordMerge<AccDataType, ThreadReduceSrcDesc_M_1, ThreadReduceDstDesc_M>;

    using BlockwiseWelford = BlockwiseWelford<AccDataType,
                                              BlockSize,
                                              ThreadClusterLengths_M_K,
                                              ThreadClusterArrangeOrder>;

    using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
                                                          BlockSize,
                                                          ThreadClusterLengths_M_K,
                                                          ThreadClusterArrangeOrder,
                                                          ck::reduce::Add,
                                                          false>;

    using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
                                                 ThreadReduceSrcDesc_M_K,
                                                 ThreadReduceDstDesc_M,
                                                 ck::reduce::Add,
                                                 false>;

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    // clang-format off
    // Two of the steps of Multiblock BatchNorm Backward
    // Step 1: Second half of Welford method to calculate mean and variance, as well as getting inv-variance = 1/sqrt(epsilon+variance)
    // Step 2: First half of Reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance)
    // clang-format on
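    // Editor's note (sketch): ThreadwiseWelfordMerge/BlockwiseWelford are assumed to
    // combine partial (mean, var, count) aggregates with the usual parallel-Welford
    // merge (Chan et al.):
    //
    //   count = count_a + count_b
    //   delta = mean_b - mean_a
    //   mean  = mean_a + delta * count_b / count
    //   var   = (var_a * count_a + var_b * count_b + delta * delta * count_a * count_b / count) / count
    //
    // so the per-block partial results written by the first-half kernel reduce to the
    // exact mean and variance of the whole reduction.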
    __device__ static void Run(const XYGridDesc_M_K& x_grid_desc_m_k,
                               const XYGridDesc_M_K& dy_grid_desc_m_k,
                               const MeanVarGridDesc_M& mean_var_grid_desc_m,
                               const MeanVarCountGridDesc_M_K& mean_var_count_grid_desc_m_k,
                               const DscaleDbiasGridDesc_M_G& dscale_dbias_grid_desc_m_g,
                               index_t blkgroup_size,
                               index_t num_xy_k_block_tile_iteration,
                               index_t num_mean_var_count_k_block_tile_iteration,
                               AccDataType epsilon,
                               bool haveSavedMeanInvVar,
                               const MeanVarDataType* const __restrict__ p_savedMean,
                               const MeanVarDataType* const __restrict__ p_savedInvVar,
                               const MeanVarDataType* const __restrict__ p_in_welford_mean,
                               const MeanVarDataType* const __restrict__ p_in_welford_variance,
                               const int32_t* const __restrict__ p_in_welford_count,
                               const DyElementwiseOp dy_elementwise_op,
                               MeanVarDataType* const __restrict__ p_out_welford_mean,
                               MeanVarDataType* const __restrict__ p_out_welford_inv_variance,
                               const XDataType* const __restrict__ p_x,
                               const DyDataType* const __restrict__ p_dy,
                               ScaleDataType* const __restrict__ p_reduce_dscale,
                               BiasDataType* const __restrict__ p_reduce_dbias)
    {
        __shared__ AccDataType p_reduce_work_buffer[BlockSize];

        auto reduce_work_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * 1, true>
            in_welford_mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * 1, true>
            in_welford_var_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize * 1, true>
            in_welford_count_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            welford_mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            welford_var_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
            welford_count_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>& mean_thread_buf =
            welford_mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>&
            inv_var_thread_buf = welford_var_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            x_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            dy_thread_buf;

        // buffer of values of dy * (x-mean) * inv-variance, used as input of Blockwise reduction
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            tmp1_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            reduce_dscale_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            reduce_dbias_thread_buf;

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / blkgroup_size;
        const index_t block_local_id  = block_global_id % blkgroup_size;

        const auto thread_cluster_idx =
            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));

        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, KThreadSliceSize>;
        using ThreadBufferLengths_M   = Sequence<MThreadSliceSize>;
        using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
        constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
        constexpr auto thread_buffer_desc_m =
            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
        constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<1>{}));

        // clang-format off
        // Step 1: load existing mean and inv-variance, or do final welford reduction on mean and variance as well as get inv-variance = 1/sqrt(epsilon+variance)
        // clang-format on

        if(haveSavedMeanInvVar)
        {
            const auto mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_savedMean, mean_var_grid_desc_m.GetElementSpaceSize());

            const auto inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_savedInvVar, mean_var_grid_desc_m.GetElementSpaceSize());

            auto threadwise_mean_inv_var_load =
                ThreadwiseTensorSliceTransfer_v2<MeanVarDataType,
                                                 AccDataType,
                                                 MeanVarGridDesc_M,
                                                 decltype(thread_buffer_desc_m),
                                                 ThreadBufferLengths_M,
                                                 Sequence<0>,
                                                 0,
                                                 MeanVarSrcVectorSize,
                                                 1,
                                                 true>(
                    mean_var_grid_desc_m,
                    make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize));

            threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m,
                                             mean_global_buf,
                                             thread_buffer_desc_m,
                                             make_tuple(I0),
                                             mean_thread_buf);

            threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m,
                                             inv_var_global_buf,
                                             thread_buffer_desc_m,
                                             make_tuple(I0),
                                             inv_var_thread_buf);
        }
        else
        {
            const auto welford_mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_in_welford_mean, mean_var_count_grid_desc_m_k.GetElementSpaceSize());

            const auto welford_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_in_welford_variance, mean_var_count_grid_desc_m_k.GetElementSpaceSize());

            const auto welford_count_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_in_welford_count, mean_var_count_grid_desc_m_k.GetElementSpaceSize());

            auto threadwise_mean_var_load_m_k =
                ThreadwiseTensorSliceTransfer_v2<AccDataType,
                                                 AccDataType,
                                                 MeanVarCountGridDesc_M_K,
                                                 decltype(thread_buffer_desc_m_1),
                                                 ThreadBufferLengths_M_1,
                                                 Sequence<0, 1>,
                                                 1,
                                                 1,
                                                 1,
                                                 true>(
                    mean_var_count_grid_desc_m_k,
                    make_multi_index(blkgroup_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize,
                                     thread_k_cluster_id * 1));

            auto threadwise_count_load_m_k =
                ThreadwiseTensorSliceTransfer_v2<int32_t,
                                                 int32_t,
                                                 MeanVarCountGridDesc_M_K,
                                                 decltype(thread_buffer_desc_m_1),
                                                 ThreadBufferLengths_M_1,
                                                 Sequence<0, 1>,
                                                 1,
                                                 1,
                                                 1,
                                                 true>(
                    mean_var_count_grid_desc_m_k,
                    make_multi_index(blkgroup_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize,
                                     thread_k_cluster_id * 1));

            constexpr auto mean_var_count_thread_copy_step_m_k =
                make_multi_index(0, KThreadClusterSize * 1);

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                welford_mean_thread_buf(I)  = type_convert<AccDataType>(0.0f);
                welford_var_thread_buf(I)   = type_convert<AccDataType>(0.0f);
                welford_count_thread_buf(I) = 0;
            });

            for(index_t reducedTiles = 0; reducedTiles < num_mean_var_count_k_block_tile_iteration;
                ++reducedTiles)
            {
                threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k,
                                                 welford_mean_global_buf,
                                                 thread_buffer_desc_m_1,
                                                 make_tuple(I0, I0),
                                                 in_welford_mean_thread_buf);

                threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k,
                                                 welford_var_global_buf,
                                                 thread_buffer_desc_m_1,
                                                 make_tuple(I0, I0),
                                                 in_welford_var_thread_buf);

                threadwise_count_load_m_k.Run(mean_var_count_grid_desc_m_k,
                                              welford_count_global_buf,
                                              thread_buffer_desc_m_1,
                                              make_tuple(I0, I0),
                                              in_welford_count_thread_buf);

                ThreadwiseWelford::Run(in_welford_mean_thread_buf,
                                       in_welford_var_thread_buf,
                                       in_welford_count_thread_buf,
                                       welford_mean_thread_buf,
                                       welford_var_thread_buf,
                                       welford_count_thread_buf);

                threadwise_mean_var_load_m_k.MoveSrcSliceWindow(
                    mean_var_count_grid_desc_m_k, mean_var_count_thread_copy_step_m_k);
                threadwise_count_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k,
                                                             mean_var_count_thread_copy_step_m_k);
            }

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                if constexpr(I > 0)
                    block_sync_lds();

                BlockwiseWelford::Run(welford_mean_thread_buf(I),
                                      welford_var_thread_buf(I),
                                      welford_count_thread_buf(I));
            });

            // calculate inv-variance as 1/sqrt(epsilon+variance), stored in place of variance
            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                welford_var_thread_buf(I) =
                    type_convert<AccDataType>(1.0) / sqrt(welford_var_thread_buf[I] + epsilon);
            });
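            // Editor's note: only the first block of each block-group (block_local_id == 0)
            // and the first thread column (thread_k_cluster_id == 0) store the final
            // mean/inv-variance; every block in the group holds identical per-M results,
            // so this avoids duplicate global writes.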

            if(block_local_id == 0 && thread_k_cluster_id == 0)
            {

                auto threadwise_mean_inv_var_store =
                    ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                                       MeanVarDataType,
                                                       decltype(thread_buffer_desc_m),
                                                       MeanVarGridDesc_M,
                                                       PassThroughOp,
                                                       ThreadBufferLengths_M,
                                                       Sequence<0>,
                                                       0,
                                                       1,
                                                       InMemoryDataOperationEnum::Set,
                                                       1,
                                                       true>(
                        mean_var_grid_desc_m,
                        make_multi_index(blkgroup_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize),
                        PassThroughOp{});

                auto mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                    p_out_welford_mean, mean_var_grid_desc_m.GetElementSpaceSize());

                auto inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                    p_out_welford_inv_variance, mean_var_grid_desc_m.GetElementSpaceSize());

                threadwise_mean_inv_var_store.Run(thread_buffer_desc_m,
                                                  make_tuple(I0),
                                                  mean_thread_buf,
                                                  mean_var_grid_desc_m,
                                                  mean_global_buf);

                threadwise_mean_inv_var_store.Run(thread_buffer_desc_m,
                                                  make_tuple(I0),
                                                  inv_var_thread_buf,
                                                  mean_var_grid_desc_m,
                                                  inv_var_global_buf);
            };
        };

        const index_t workSizePerBlock = K_BlockTileSize * num_xy_k_block_tile_iteration;

        auto threadwise_x_load =
            ThreadwiseTensorSliceTransfer_v2<XDataType,
                                             AccDataType,
                                             XYGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
                                             ThreadBufferDimAccessOrder,
                                             XDyVectorDim,
                                             XSrcVectorSize,
                                             1,
                                             true>(
                x_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 workSizePerBlock * block_local_id +
                                     thread_k_cluster_id * KThreadSliceSize));

        auto threadwise_dy_load =
            ThreadwiseTensorSliceTransfer_v2<DyDataType,
                                             AccDataType,
                                             XYGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
                                             ThreadBufferDimAccessOrder,
                                             XDyVectorDim,
                                             DySrcVectorSize,
                                             1,
                                             true>(
                dy_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 workSizePerBlock * block_local_id +
                                     thread_k_cluster_id * KThreadSliceSize));

        const auto x_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_x, x_grid_desc_m_k.GetElementSpaceSize());

        const auto dy_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_dy, dy_grid_desc_m_k.GetElementSpaceSize());

        constexpr auto xy_thread_copy_step_m_k = make_multi_index(0, K_BlockTileSize);

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            reduce_dscale_thread_buf(I) = type_convert<AccDataType>(0);
            reduce_dbias_thread_buf(I)  = type_convert<AccDataType>(0);
        });

        // clang-format off
        // Step 2: first-half of reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance)
        // clang-format on

        for(index_t reducedTiles = 0; reducedTiles < num_xy_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
                                  x_global_buf,
                                  thread_buffer_desc_m_k,
                                  make_tuple(I0, I0),
                                  x_thread_buf);

            threadwise_dy_load.Run(dy_grid_desc_m_k,
                                   dy_global_buf,
                                   thread_buffer_desc_m_k,
                                   make_tuple(I0, I0),
                                   dy_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                    dy_elementwise_op(dy_thread_buf(Number<offset>{}),
                                      dy_thread_buf[Number<offset>{}]);

                    AccDataType norm_x = (x_thread_buf[Number<offset>{}] - mean_thread_buf[iM]) *
                                         inv_var_thread_buf[iM];

                    tmp1_thread_buf(Number<offset>{}) = norm_x * dy_thread_buf[Number<offset>{}];
                });
            });

            ThreadwiseReduce::Reduce(tmp1_thread_buf, reduce_dscale_thread_buf);
            ThreadwiseReduce::Reduce(dy_thread_buf, reduce_dbias_thread_buf);

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, xy_thread_copy_step_m_k);
            threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, xy_thread_copy_step_m_k);
        };

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(I > 0)
                block_sync_lds();

            BlockwiseReduce::Reduce(reduce_work_buf, reduce_dscale_thread_buf(I));
            block_sync_lds();
            BlockwiseReduce::Reduce(reduce_work_buf, reduce_dbias_thread_buf(I));
        });

        auto threadwise_dscale_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               ScaleDataType,
                                               decltype(thread_buffer_desc_m_1),
                                               DscaleDbiasGridDesc_M_G,
                                               PassThroughOp,
                                               ThreadBufferLengths_M_1,
                                               Sequence<0, 1>,
                                               1,
                                               1,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                dscale_dbias_grid_desc_m_g,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id),
                PassThroughOp{});

        auto threadwise_dbias_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               BiasDataType,
                                               decltype(thread_buffer_desc_m_1),
                                               DscaleDbiasGridDesc_M_G,
                                               PassThroughOp,
                                               ThreadBufferLengths_M_1,
                                               Sequence<0, 1>,
                                               1,
                                               1,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                dscale_dbias_grid_desc_m_g,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id),
                PassThroughOp{});

        auto reduce_dscale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_reduce_dscale, dscale_dbias_grid_desc_m_g.GetElementSpaceSize());

        auto reduce_dbias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_reduce_dbias, dscale_dbias_grid_desc_m_g.GetElementSpaceSize());

        if(thread_k_cluster_id == 0)
        {
            threadwise_dscale_store.Run(thread_buffer_desc_m_1,
                                        make_tuple(I0, I0),
                                        reduce_dscale_thread_buf,
                                        dscale_dbias_grid_desc_m_g,
                                        reduce_dscale_global_buf);

            threadwise_dbias_store.Run(thread_buffer_desc_m_1,
                                       make_tuple(I0, I0),
                                       reduce_dbias_thread_buf,
                                       dscale_dbias_grid_desc_m_g,
                                       reduce_dbias_global_buf);
        };
    };
};

} // namespace ck
@@ -0,0 +1,572 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/utility/math_v2.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename GridwiseBatchNormBackwardWithBlockwiseWelford_,
          typename XDataType,
          typename DyDataType,
          typename DxDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename ScaleBiasGridDesc_M,
          typename MeanVarGridDesc_M,
          typename GetReduceCountPerThreadFunctor>
__global__ void kernel_batchnorm_backward_with_blockwise_welford(
    const XYGridDesc_M_K x_grid_desc_m_k,
    const XYGridDesc_M_K dy_grid_desc_m_k,
    const XYGridDesc_M_K dx_grid_desc_m_k,
    const ScaleBiasGridDesc_M scale_grid_desc_m,
    const ScaleBiasGridDesc_M bias_grid_desc_m,
    const MeanVarGridDesc_M mean_var_grid_desc_m,
    const GetReduceCountPerThreadFunctor get_reduce_count_per_thread,
    long_index_t reduce_size,
    index_t num_k_block_tile_iteration,
    AccDataType epsilon,
    const XDataType* const __restrict__ p_x,
    const DyDataType* const __restrict__ p_dy,
    const ScaleDataType* const __restrict__ p_scale,
    bool haveSavedMeanInvVar,
    const MeanVarDataType* const __restrict__ p_savedMean,
    const MeanVarDataType* const __restrict__ p_savedInvVar,
    const DyElementwiseOp dy_elementwise_op,
    DxDataType* const __restrict__ p_dx,
    ScaleDataType* const __restrict__ p_dscale,
    BiasDataType* const __restrict__ p_dbias)
{
    GridwiseBatchNormBackwardWithBlockwiseWelford_::Run(
        x_grid_desc_m_k,
        dy_grid_desc_m_k,
        dx_grid_desc_m_k,
        scale_grid_desc_m,
        bias_grid_desc_m,
        mean_var_grid_desc_m,
        get_reduce_count_per_thread,
        reduce_size,
        num_k_block_tile_iteration,
        epsilon,
        p_x,
        p_dy,
        p_scale,
        haveSavedMeanInvVar,
        p_savedMean,
        p_savedInvVar,
        dy_elementwise_op,
        p_dx,
        p_dscale,
        p_dbias);
};

template <typename XDataType,
          typename DyDataType,
          typename DxDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename ScaleBiasGridDesc_M,
          typename MeanVarGridDesc_M,
          typename GetReduceCountPerThreadFunctor,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XDyDxVectorDim,
          index_t XSrcVectorSize,
          index_t DySrcVectorSize,
          index_t DxDstVectorSize,
          index_t ScaleSrcDstVectorSize,
          index_t BiasDstVectorSize,
          index_t MeanVarSrcVectorSize>
struct GridwiseBatchNormBackwardWithBlockwiseWelford
{
    static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 &&
                   MThreadSliceSize % DySrcVectorSize == 0 &&
                   MThreadSliceSize % DxDstVectorSize == 0) ||
                      (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 &&
                       KThreadSliceSize % DySrcVectorSize == 0 &&
                       KThreadSliceSize % DxDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr bool reorder_thread_cluster = (XDyDxVectorDim == 0);

    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;

    using ThreadBufferDimAccessOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    using ThreadClusterArrangeOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using ThreadwiseWelford =
        ThreadwiseWelford<AccDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M>;

    using BlockwiseWelford = BlockwiseWelford<AccDataType,
                                              BlockSize,
                                              ThreadClusterLengths_M_K,
                                              ThreadClusterArrangeOrder>;

    using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
                                                          BlockSize,
                                                          ThreadClusterLengths_M_K,
                                                          ThreadClusterArrangeOrder,
                                                          ck::reduce::Add,
                                                          false>;

    using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
                                                 ThreadReduceSrcDesc_M_K,
                                                 ThreadReduceDstDesc_M,
                                                 ck::reduce::Add,
                                                 false>;

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    // clang-format off
    // Blockwise BatchNorm Backward
    // Input: x, dy, scale, savedMean and savedInvVar (optional), reduce_size
    // Output: dx, dscale, dbias
    // Step 1: calculating mean and inv-variance using welford method (if savedMean/savedInvVar not available), where inv-variance = 1/sqrt(epsilon+variance)
    // Step 2: reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance)
    // Step 3: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance) element-wise
    // clang-format on
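    // Editor's reference sketch (host-side scalar equivalent of Steps 1-3 for one
    // channel m over a reduction of size N; names are illustrative only):
    //
    //   mean     = sum_k(x[m][k]) / N;
    //   var      = sum_k((x[m][k] - mean)^2) / N;
    //   inv_var  = 1 / sqrt(var + epsilon);
    //   dbias    = sum_k(dy[m][k]);
    //   dscale   = sum_k(dy[m][k] * (x[m][k] - mean) * inv_var);
    //   dx[m][k] = (scale[m] * inv_var / N) *
    //              (N * dy[m][k] - dbias - dscale * (x[m][k] - mean) * inv_var);
    //
    // The kernel below computes the same quantities with threadwise/blockwise Welford
    // and blockwise reductions in place of the naive sums.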
__device__ static void Run(const XYGridDesc_M_K x_grid_desc_m_k,
|
||||
const XYGridDesc_M_K dy_grid_desc_m_k,
|
||||
const XYGridDesc_M_K dx_grid_desc_m_k,
|
||||
const ScaleBiasGridDesc_M scale_grid_desc_m,
|
||||
const ScaleBiasGridDesc_M bias_grid_desc_m,
|
||||
const MeanVarGridDesc_M mean_var_grid_desc_m,
|
||||
const GetReduceCountPerThreadFunctor get_reduce_count_per_thread,
|
||||
long_index_t reduce_size,
|
||||
index_t num_k_block_tile_iteration,
|
||||
AccDataType epsilon,
|
||||
const XDataType* const __restrict__ p_x,
|
||||
const DyDataType* const __restrict__ p_dy,
|
||||
const ScaleDataType* const __restrict__ p_scale,
|
||||
bool haveSavedMeanInvVar,
|
||||
const MeanVarDataType* const __restrict__ p_savedMean,
|
||||
const MeanVarDataType* const __restrict__ p_savedInvVar,
|
||||
const DyElementwiseOp dy_elementwise_op,
|
||||
DxDataType* const __restrict__ p_dx,
|
||||
ScaleDataType* const __restrict__ p_dscale,
|
||||
BiasDataType* const __restrict__ p_dbias)
|
||||
{
|
||||
using ck::math::sqrt;
|
||||
|
||||
__shared__ AccDataType p_reduce_work_buffer[BlockSize];
|
||||
|
||||
auto reduce_work_buf =
|
||||
make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
|
||||
x_thread_buf;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
|
||||
dy_thread_buf;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
|
||||
dx_thread_buf;
|
||||
|
||||
// buffer of values of dy * (x-mean) * invVariance, used as input of Blockwise reduction
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
|
||||
tmp1_thread_buf;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> scale_thread_buf;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> mean_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> var_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>&
|
||||
inv_var_thread_buf = var_thread_buf;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> dscale_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> dbias_thread_buf;
|
||||
|
||||
const index_t thread_local_id = get_thread_local_1d_id();
|
||||
const index_t block_global_id = get_block_1d_id();
|
||||
|
||||
const auto thread_cluster_idx =
|
||||
thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
|
||||
|
||||
const auto thread_m_cluster_id = thread_cluster_idx[I0];
|
||||
const auto thread_k_cluster_id = thread_cluster_idx[I1];
|
||||
|
||||
using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, KThreadSliceSize>;
|
||||
using ThreadBufferLengths_M = Sequence<MThreadSliceSize>;
|
||||
constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
|
||||
constexpr auto thread_buffer_desc_m =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
|
||||
|
||||
auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
|
||||
AccDataType,
|
||||
XYGridDesc_M_K,
|
||||
decltype(thread_buffer_desc_m_k),
|
||||
ThreadBufferLengths_M_K,
|
||||
ThreadBufferDimAccessOrder,
|
||||
XDyDxVectorDim,
|
||||
XSrcVectorSize,
|
||||
1,
|
||||
true>(
|
||||
x_grid_desc_m_k,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize,
|
||||
thread_k_cluster_id * KThreadSliceSize));
|
||||
|
||||
auto threadwise_dy_load = ThreadwiseTensorSliceTransfer_v2<DyDataType,
|
||||
AccDataType,
|
||||
XYGridDesc_M_K,
|
||||
decltype(thread_buffer_desc_m_k),
|
||||
ThreadBufferLengths_M_K,
|
||||
ThreadBufferDimAccessOrder,
|
||||
XDyDxVectorDim,
|
||||
XSrcVectorSize,
|
||||
1,
|
||||
true>(
|
||||
x_grid_desc_m_k,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize,
|
||||
thread_k_cluster_id * KThreadSliceSize));
|
||||
|
||||
auto threadwise_dx_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
|
||||
DxDataType,
|
||||
decltype(thread_buffer_desc_m_k),
|
||||
XYGridDesc_M_K,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M_K,
|
||||
ThreadBufferDimAccessOrder,
|
||||
XDyDxVectorDim,
|
||||
DxDstVectorSize,
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
dy_grid_desc_m_k,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize,
|
||||
thread_k_cluster_id * KThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
auto threadwise_scale_load =
|
||||
ThreadwiseTensorSliceTransfer_v2<ScaleDataType,
|
||||
AccDataType,
|
||||
ScaleBiasGridDesc_M,
|
||||
decltype(thread_buffer_desc_m),
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>,
|
||||
0,
|
||||
ScaleSrcDstVectorSize,
|
||||
1,
|
||||
true>(
|
||||
scale_grid_desc_m,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize));
|
||||
|
||||
auto threadwise_dscale_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
|
||||
ScaleDataType,
|
||||
decltype(thread_buffer_desc_m),
|
||||
ScaleBiasGridDesc_M,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>,
|
||||
0,
|
||||
ScaleSrcDstVectorSize,
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
scale_grid_desc_m,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
auto threadwise_dbias_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
|
||||
BiasDataType,
|
||||
decltype(thread_buffer_desc_m),
|
||||
ScaleBiasGridDesc_M,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>,
|
||||
0,
|
||||
BiasDstVectorSize,
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
bias_grid_desc_m,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileSize);
|
||||
constexpr auto thread_copy_bwd_step_m_k = make_multi_index(0, -K_BlockTileSize);
|
||||
|
||||
const auto x_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_x, x_grid_desc_m_k.GetElementSpaceSize());
|
||||
|
||||
const auto dy_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_dy, dy_grid_desc_m_k.GetElementSpaceSize());
|
||||
|
||||
auto dx_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_dx, dx_grid_desc_m_k.GetElementSpaceSize());
|
||||
|
||||
const auto scale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_scale, scale_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
auto dscale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_dscale, scale_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
auto dbias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_dbias, bias_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
// clang-format off
|
||||
// Step 1: calculating mean and inv-variance using welford method (if savedMean/savedInvVar not available), where inv-variance = 1/sqrt(epsilon+variance)
|
||||
// clang-format on
|
||||

        if(haveSavedMeanInvVar)
        {
            const auto mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_savedMean, mean_var_grid_desc_m.GetElementSpaceSize());

            const auto inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_savedInvVar, mean_var_grid_desc_m.GetElementSpaceSize());

            auto threadwise_mean_inv_var_load =
                ThreadwiseTensorSliceTransfer_v2<MeanVarDataType,
                                                 AccDataType,
                                                 MeanVarGridDesc_M,
                                                 decltype(thread_buffer_desc_m),
                                                 ThreadBufferLengths_M,
                                                 Sequence<0>,
                                                 0,
                                                 MeanVarSrcVectorSize,
                                                 1,
                                                 true>(
                    mean_var_grid_desc_m,
                    make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize));

            threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m,
                                             mean_global_buf,
                                             thread_buffer_desc_m,
                                             make_tuple(I0),
                                             mean_thread_buf);

            threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m,
                                             inv_var_global_buf,
                                             thread_buffer_desc_m,
                                             make_tuple(I0),
                                             inv_var_thread_buf);
        }
        else
        {
            auto threadwise_welford       = ThreadwiseWelford();
            threadwise_welford.max_count_ = get_reduce_count_per_thread(thread_k_cluster_id);

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                mean_thread_buf(I) = type_convert<AccDataType>(0.0f);
                var_thread_buf(I)  = type_convert<AccDataType>(0.0f);
            });

            for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
            {
                threadwise_x_load.Run(x_grid_desc_m_k,
                                      x_global_buf,
                                      thread_buffer_desc_m_k,
                                      make_tuple(I0, I0),
                                      x_thread_buf);

                threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
                threadwise_welford.Run(x_thread_buf, mean_thread_buf, var_thread_buf);
            }

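            // Merge each thread's partial (mean, var, count) across the K thread cluster;
            // BlockwiseWelford exchanges the partials through LDS, hence the sync before
            // each reuse of that workspace.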
            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                if constexpr(I > 0)
                    block_sync_lds();

                int count = threadwise_welford.cur_count_;
                BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count);
            });

            // calculate inv-variance as 1/sqrt(epsilon+variance)
            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                inv_var_thread_buf(I) =
                    type_convert<AccDataType>(1.0) / sqrt(var_thread_buf[I] + epsilon);
            });
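            // Keeping inv-variance rather than variance lets Step 2 and Step 3 normalize x
            // with a multiply instead of a divide per element; it also matches the
            // savedInvVariance produced by forward training. The x window is then rewound
            // to its origin so Step 2 can stream the same tiles again.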

            threadwise_x_load.SetSrcSliceOrigin(
                x_grid_desc_m_k,
                make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * KThreadSliceSize));
        };

        // clang-format off
        // Step 2: reduction: dbias = sum(dy), dscale = sum(dy * (x - mean) * inv-variance)
        // clang-format on

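        // For y = scale * (x - mean) * inv-variance + bias, the parameter gradients are
        //     dbias  = sum over reduce dims of dy
        //     dscale = sum over reduce dims of dy * norm_x, with norm_x = (x - mean) * inv-variance
        // Each thread first accumulates its own slice (ThreadwiseReduce); the partial sums
        // are then combined across the block (BlockwiseReduce) below.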
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            dscale_thread_buf(I) = type_convert<AccDataType>(0);
            dbias_thread_buf(I)  = type_convert<AccDataType>(0);
        });

        for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
                                  x_global_buf,
                                  thread_buffer_desc_m_k,
                                  make_tuple(I0, I0),
                                  x_thread_buf);

            threadwise_dy_load.Run(dy_grid_desc_m_k,
                                   dy_global_buf,
                                   thread_buffer_desc_m_k,
                                   make_tuple(I0, I0),
                                   dy_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                    dy_elementwise_op(dy_thread_buf(Number<offset>{}),
                                      dy_thread_buf[Number<offset>{}]);

                    AccDataType norm_x = (x_thread_buf[Number<offset>{}] - mean_thread_buf[iM]) *
                                         inv_var_thread_buf[iM];

                    tmp1_thread_buf(Number<offset>{}) = norm_x * dy_thread_buf[Number<offset>{}];
                });
            });

            ThreadwiseReduce::Reduce(tmp1_thread_buf, dscale_thread_buf);
            ThreadwiseReduce::Reduce(dy_thread_buf, dbias_thread_buf);

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
            threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_fwd_step_m_k);
        };

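        // The two consecutive blockwise reductions share the same LDS work buffer, so a
        // block_sync_lds() is required between them (and before reusing the buffer on the
        // next iM slice).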
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(I > 0)
                block_sync_lds();
            BlockwiseReduce::Reduce(reduce_work_buf, dscale_thread_buf(I));
            block_sync_lds();
            BlockwiseReduce::Reduce(reduce_work_buf, dbias_thread_buf(I));
        });

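        // Only the thread_k_cluster_id == 0 lane of each M position writes the reduced
        // dscale/dbias to global memory.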
        if(thread_k_cluster_id == 0)
        {
            threadwise_dscale_store.Run(thread_buffer_desc_m,
                                        make_tuple(I0),
                                        dscale_thread_buf,
                                        scale_grid_desc_m,
                                        dscale_global_buf);

            threadwise_dbias_store.Run(thread_buffer_desc_m,
                                       make_tuple(I0),
                                       dbias_thread_buf,
                                       bias_grid_desc_m,
                                       dbias_global_buf);
        };

        // clang-format off
        // Step 3: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance) elementwise
        // clang-format on

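        // This is the standard batchnorm input gradient, with N = reduce_size and
        // norm_x = (x - mean) * inv-variance:
        //     dx = scale * inv-variance * (dy - dbias/N - norm_x * dscale/N)
        //        = 1/N * inv-variance * scale * (N * dy - dbias - norm_x * dscale)
        // The second form is what the loop below evaluates, factoring
        // 1/N * inv-variance * scale into a per-channel multiplier.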
        threadwise_scale_load.Run(scale_grid_desc_m,
                                  scale_global_buf,
                                  thread_buffer_desc_m,
                                  make_tuple(I0),
                                  scale_thread_buf);

        auto thread_copy_tail_m_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_m_k;

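        // After Step 2 the x/dy windows sit one tile past the end, so a single backward
        // step puts them on the last tile; the dx window is moved straight to the tail.
        // Step 3 then walks the K tiles in reverse, presumably so the most recently
        // touched tiles are revisited while still cached.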
        threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k);
        threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_bwd_step_m_k);
        threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, thread_copy_tail_m_k);

        AccDataType inv_reduce_size =
            type_convert<AccDataType>(1.0) / type_convert<AccDataType>(reduce_size);

        for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
                                  x_global_buf,
                                  thread_buffer_desc_m_k,
                                  make_tuple(I0, I0),
                                  x_thread_buf);

            threadwise_dy_load.Run(dy_grid_desc_m_k,
                                   dy_global_buf,
                                   thread_buffer_desc_m_k,
                                   make_tuple(I0, I0),
                                   dy_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                AccDataType multiplier =
                    inv_reduce_size * inv_var_thread_buf[iM] * scale_thread_buf[iM];

                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                    dy_elementwise_op(dy_thread_buf(Number<offset>{}),
                                      dy_thread_buf[Number<offset>{}]);

                    AccDataType norm_x = (x_thread_buf[Number<offset>{}] - mean_thread_buf[iM]) *
                                         inv_var_thread_buf[iM];

                    AccDataType tmpVal = norm_x * dscale_thread_buf[iM];

                    dx_thread_buf(Number<offset>{}) =
                        multiplier *
                        (type_convert<AccDataType>(reduce_size) * dy_thread_buf[Number<offset>{}] -
                         dbias_thread_buf[iM] - tmpVal);
                });
            });

            threadwise_dx_store.Run(thread_buffer_desc_m_k,
                                    make_tuple(I0, I0),
                                    dx_thread_buf,
                                    dx_grid_desc_m_k,
                                    dx_global_buf);

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k);
            threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_bwd_step_m_k);
            threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, thread_copy_bwd_step_m_k);
        }
    }
};

} // namespace ck

@@ -441,6 +441,7 @@ struct GridwiseBatchNormForwardWithBlockwiseWelford
        auto result_inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            resultSaveInvVariance, mean_var_grid_desc_m.GetElementSpaceSize());

        // calculate inv-variance as 1/sqrt(epsilon+variance), stored in place of variance
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            var_thread_buf(I) =
                type_convert<AccDataType>(1.0f) / sqrt(epsilon + var_thread_buf[I]);

@@ -0,0 +1,258 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/utility/math.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

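// First half of the two-pass multiblock Welford: each block reduces its slice of the
// K (reduce) dimension and writes one partial (mean, variance, count) triple per M
// position into an M x G workspace, where G is the number of blocks cooperating on the
// same M tile. A second-half kernel later merges the G partials per M position into
// the final mean and variance.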
template <typename GridwiseMultiblockWelfordFirstHalf_,
          typename XDataType,
          typename MeanVarDataType,
          typename XGridDesc_M_K,
          typename MeanVarCountGridDesc_M_G,
          typename GetReduceCountPerThreadFunctor>
__global__ void kernel_multiblock_welford_first_half(
    const XGridDesc_M_K x_grid_desc_m_k,
    const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g,
    const GetReduceCountPerThreadFunctor get_reduce_count_per_thread,
    index_t num_k_block_tile_iteration,
    const XDataType* const __restrict__ p_x,
    MeanVarDataType* const p_welford_mean,
    MeanVarDataType* const p_welford_variance,
    int32_t* const p_welford_count)
{
    GridwiseMultiblockWelfordFirstHalf_::Run(x_grid_desc_m_k,
                                             mean_var_count_grid_desc_m_g,
                                             get_reduce_count_per_thread,
                                             num_k_block_tile_iteration,
                                             p_x,
                                             p_welford_mean,
                                             p_welford_variance,
                                             p_welford_count);
};

template <typename XDataType,
          typename AccDataType,
          typename MeanVarDataType,
          typename XGridDesc_M_K,
          typename MeanVarCountGridDesc_M_G,
          typename GetReduceCountPerThreadFunctor,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XSrcCountSrcVectorDim,
          index_t XSrcCountSrcVectorSize>
struct GridwiseMultiblockWelfordFirstHalf
{
    static_assert((XSrcCountSrcVectorDim == 0 && MThreadSliceSize % XSrcCountSrcVectorSize == 0) ||
                      (XSrcCountSrcVectorDim == 1 &&
                       KThreadSliceSize % XSrcCountSrcVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr bool reorder_thread_cluster = (XSrcCountSrcVectorDim == 0);

    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;

    using ThreadBufferDimAccessOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    using ThreadClusterArrangeOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using ThreadwiseWelford =
        ThreadwiseWelford<AccDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M>;

    using BlockwiseWelford = BlockwiseWelford<AccDataType,
                                              BlockSize,
                                              ThreadClusterLengths_M_K,
                                              ThreadClusterArrangeOrder,
                                              false>;

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    __device__ static void Run(const XGridDesc_M_K& x_grid_desc_m_k,
                               const MeanVarCountGridDesc_M_G& mean_var_count_grid_desc_m_g,
                               const GetReduceCountPerThreadFunctor& get_reduce_count_per_thread,
                               index_t num_k_block_tile_iteration,
                               const XDataType* const __restrict__ p_x,
                               MeanVarDataType* const p_welford_mean,
                               MeanVarDataType* const p_welford_variance,
                               int32_t* const p_welford_count)
    {
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            x_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            welford_mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            welford_var_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
            welford_count_thread_buf;

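        // Grid decomposition: blkgroup_size blocks cooperate on one M tile, splitting the
        // K (reduce) dimension among them; blkgroup_id selects the M tile, and
        // block_local_id selects this block's K segment, which is also the G column it
        // writes in the (mean, var, count) workspace.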
        const index_t blkgroup_size = mean_var_count_grid_desc_m_g.GetLength(I1);

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / blkgroup_size;
        const index_t block_local_id  = block_global_id % blkgroup_size;

        const auto thread_cluster_idx =
            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));

        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, KThreadSliceSize>;
        using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;

        constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
        constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<1>{}));

        const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;

        auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
                                                                  AccDataType,
                                                                  XGridDesc_M_K,
                                                                  decltype(thread_buffer_desc_m_k),
                                                                  ThreadBufferLengths_M_K,
                                                                  ThreadBufferDimAccessOrder,
                                                                  XSrcCountSrcVectorDim,
                                                                  XSrcCountSrcVectorSize,
                                                                  1,
                                                                  true>(
            x_grid_desc_m_k,
            make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
                             block_local_id * reduceSizePerBlock +
                                 thread_k_cluster_id * KThreadSliceSize));

        auto threadwise_welford_mean_var_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               MeanVarDataType,
                                               decltype(thread_buffer_desc_m_1),
                                               MeanVarCountGridDesc_M_G,
                                               PassThroughOp,
                                               ThreadBufferLengths_M_1,
                                               Sequence<0, 1>,
                                               1,
                                               1,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                mean_var_count_grid_desc_m_g,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id),
                PassThroughOp{});

        auto threadwise_welford_count_store =
            ThreadwiseTensorSliceTransfer_v1r3<int32_t,
                                               int32_t,
                                               decltype(thread_buffer_desc_m_1),
                                               MeanVarCountGridDesc_M_G,
                                               PassThroughOp,
                                               ThreadBufferLengths_M_1,
                                               Sequence<0, 1>,
                                               1,
                                               1,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                mean_var_count_grid_desc_m_g,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id),
                PassThroughOp{});

        constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileSize);

        const auto x_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_x, x_grid_desc_m_k.GetElementSpaceSize());

        auto welford_mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_welford_mean, mean_var_count_grid_desc_m_g.GetElementSpaceSize());

        auto welford_var_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_welford_variance, mean_var_count_grid_desc_m_g.GetElementSpaceSize());

        auto welford_count_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_welford_count, mean_var_count_grid_desc_m_g.GetElementSpaceSize());

        auto threadwise_welford = ThreadwiseWelford();
        threadwise_welford.max_count_ =
            get_reduce_count_per_thread(block_local_id, thread_k_cluster_id);

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            welford_mean_thread_buf(I) = type_convert<AccDataType>(0.0f);
            welford_var_thread_buf(I)  = type_convert<AccDataType>(0.0f);
        });

        for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
                                  x_global_val_buf,
                                  thread_buffer_desc_m_k,
                                  make_tuple(I0, I0),
                                  x_thread_buf);

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
            threadwise_welford.Run(x_thread_buf, welford_mean_thread_buf, welford_var_thread_buf);
        }

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(I > 0)
                block_sync_lds();

            welford_count_thread_buf(I) = threadwise_welford.cur_count_;
            BlockwiseWelford::Run(
                welford_mean_thread_buf(I), welford_var_thread_buf(I), welford_count_thread_buf(I));
        });

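        // Each block emits exactly one (mean, variance, count) partial per M position,
        // written by its thread_k_cluster_id == 0 lanes into column block_local_id of the
        // M x G workspace for the second-half kernel to merge.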
        if(thread_k_cluster_id == 0)
        {
            threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1,
                                                  make_tuple(I0, I0),
                                                  welford_mean_thread_buf,
                                                  mean_var_count_grid_desc_m_g,
                                                  welford_mean_global_val_buf);

            threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1,
                                                  make_tuple(I0, I0),
                                                  welford_var_thread_buf,
                                                  mean_var_count_grid_desc_m_g,
                                                  welford_var_global_val_buf);

            threadwise_welford_count_store.Run(thread_buffer_desc_m_1,
                                               make_tuple(I0, I0),
                                               welford_count_thread_buf,
                                               mean_var_count_grid_desc_m_g,
                                               welford_count_global_val_buf);
        };
    }
};

} // namespace ck

@@ -0,0 +1,319 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>
#include <algorithm>
#include <thread>

#include "ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp"

namespace ck {
namespace tensor_operation {
namespace host {

template <typename XDataType,
          typename DyDataType,
          typename DxDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp>
struct ReferenceBatchNormBwd_Input_N_H_W_C_Output_C
    : public device::DeviceBatchNormBwd<4, 3, DyElementwiseOp>
{
    struct Argument : public device::BaseArgument
    {
        Argument(const std::array<index_t, 4> xyLengths,
                 const std::array<index_t, 4> xStrides,
                 const std::array<index_t, 4> dyStrides,
                 const std::array<index_t, 4> dxStrides,
                 const std::array<int, 3> reduceDims,
                 const std::array<ck::index_t, 1> bnScaleBiasMeanVarLengths,
                 const std::array<ck::index_t, 1> bnScaleStrides,
                 const std::array<ck::index_t, 1> bnBiasStrides,
                 const std::array<ck::index_t, 1> bnMeanVarStrides,
                 const XDataType* p_x,
                 const DyDataType* p_dy,
                 const ScaleDataType* p_scale,
                 const MeanVarDataType* p_savedMean,
                 const MeanVarDataType* p_savedInvVar,
                 double epsilon,
                 const DyElementwiseOp dy_elementwise_op,
                 DxDataType* p_dx,
                 ScaleDataType* p_dscale,
                 BiasDataType* p_dbias)
            : p_x_(p_x),
              p_dy_(p_dy),
              p_scale_(p_scale),
              p_savedMean_(p_savedMean),
              p_savedInvVar_(p_savedInvVar),
              epsilon_(epsilon),
              dy_elementwise_op_(dy_elementwise_op),
              p_dx_(p_dx),
              p_dscale_(p_dscale),
              p_dbias_(p_dbias)
        {
            ignore = xStrides;
            ignore = dyStrides;
            ignore = dxStrides;
            ignore = bnScaleStrides;
            ignore = bnBiasStrides;
            ignore = bnMeanVarStrides;

            if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 ||
               bnScaleBiasMeanVarLengths[0] != xyLengths[3])
                throw std::runtime_error("Invalid tensor dimensions!");

            if(reduceDims[0] != 0 || reduceDims[1] != 1 || reduceDims[2] != 2)
                throw std::runtime_error("Invalid reduce dimensions!");

            n_ = xyLengths[0];
            h_ = xyLengths[1];
            w_ = xyLengths[2];
            c_ = xyLengths[3];

            haveSavedMeanInvVar_ = (p_savedMean != nullptr && p_savedInvVar != nullptr);
        }

        const XDataType* p_x_;
        const DyDataType* p_dy_;
        const ScaleDataType* p_scale_;
        const MeanVarDataType* p_savedMean_;
        const MeanVarDataType* p_savedInvVar_;

        double epsilon_;
        const DyElementwiseOp dy_elementwise_op_;

        DxDataType* p_dx_;
        ScaleDataType* p_dscale_;
        BiasDataType* p_dbias_;

        bool haveSavedMeanInvVar_;

        index_t n_, h_, w_, c_;
    };

    struct Invoker : public device::BaseInvoker
    {
        float Run(const Argument& arg)
        {
            auto thread_reduce_func = [&](auto iC) {
                AccDataType reduceSize = type_convert<AccDataType>(arg.n_) *
                                         type_convert<AccDataType>(arg.h_) *
                                         type_convert<AccDataType>(arg.w_);
                index_t offset_C = iC;
                AccDataType mean;
                AccDataType invVar;

                if(arg.haveSavedMeanInvVar_)
                {
                    mean   = arg.p_savedMean_[offset_C];
                    invVar = arg.p_savedInvVar_[offset_C];
                }
                else
                {
                    AccDataType meansquare;

                    meansquare = type_convert<AccDataType>(0.0f);
                    mean       = type_convert<AccDataType>(0.0f);

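                    // Note: the reference computes variance with the two-moment formula
                    // var = E[x^2] - E[x]^2 (accumulated in AccDataType) instead of the
                    // Welford method used by the kernels.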
                    // compute mean, meansquare, variance, inv-variance
                    for(index_t iN = 0; iN < arg.n_; iN++)
                    {
                        index_t offset_N = iN * arg.h_ * arg.w_ * arg.c_;
                        for(index_t iH = 0; iH < arg.h_; iH++)
                        {
                            index_t offset_H = iH * arg.w_ * arg.c_;
                            for(index_t iW = 0; iW < arg.w_; iW++)
                            {
                                index_t offset_W = iW * arg.c_;

                                auto offset = offset_N + offset_H + offset_W + offset_C;

                                AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);

                                mean += x;
                                meansquare += x * x;
                            };
                        }
                    };

                    mean       = mean / reduceSize;
                    meansquare = meansquare / reduceSize;

                    AccDataType variance = meansquare - mean * mean;
                    invVar               = type_convert<AccDataType>(1.0f) /
                             std::sqrt(type_convert<AccDataType>(arg.epsilon_) + variance);
                };

                AccDataType dbias  = type_convert<AccDataType>(0.0f); // Sum on NHW of dy
                AccDataType dscale = type_convert<AccDataType>(0.0f); // Sum on NHW of dy * norm_x

                // 1) calculate dy * (x - mean) * inv-variance
                // 2) calculate sum(dy) on NHW dimensions
                // 3) calculate sum(dy * norm_x) on NHW dimensions
                for(index_t iN = 0; iN < arg.n_; iN++)
                {
                    index_t offset_N = iN * arg.h_ * arg.w_ * arg.c_;
                    for(index_t iH = 0; iH < arg.h_; iH++)
                    {
                        index_t offset_H = iH * arg.w_ * arg.c_;
                        for(index_t iW = 0; iW < arg.w_; iW++)
                        {
                            index_t offset_W = iW * arg.c_;

                            auto offset = offset_N + offset_H + offset_W + offset_C;

                            AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);

                            AccDataType norm_x = (x - mean) * invVar;
                            AccDataType dy     = type_convert<AccDataType>(arg.p_dy_[offset]);

                            arg.dy_elementwise_op_(dy, dy);

                            dbias += dy;
                            dscale += norm_x * dy;
                        };
                    }
                };

                arg.p_dscale_[offset_C] = type_convert<ScaleDataType>(dscale);
                arg.p_dbias_[offset_C]  = type_convert<BiasDataType>(dbias);

                AccDataType scale = type_convert<AccDataType>(arg.p_scale_[offset_C]);
                AccDataType multiplier =
                    type_convert<AccDataType>(1.0f) / reduceSize * invVar * scale;

                // 1) calculate tmp = dscale * (x - mean) * inv-variance
                // 2) calculate dx = 1/nhw * inv-variance * scale * (nhw * dy - dbias - tmp)
                for(index_t iN = 0; iN < arg.n_; iN++)
                {
                    index_t offset_N = iN * arg.h_ * arg.w_ * arg.c_;
                    for(index_t iH = 0; iH < arg.h_; iH++)
                    {
                        index_t offset_H = iH * arg.w_ * arg.c_;
                        for(index_t iW = 0; iW < arg.w_; iW++)
                        {
                            index_t offset_W = iW * arg.c_;

                            auto offset = offset_N + offset_H + offset_W + offset_C;

                            AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);

                            AccDataType norm_x = (x - mean) * invVar;
                            AccDataType dy     = type_convert<AccDataType>(arg.p_dy_[offset]);

                            arg.dy_elementwise_op_(dy, dy);

                            AccDataType tmpVal = norm_x * dscale;

                            AccDataType dx = multiplier * (reduceSize * dy - dbias - tmpVal);

                            arg.p_dx_[offset] = type_convert<DxDataType>(dx);
                        };
                    }
                };
            };

            std::size_t num_thread      = std::thread::hardware_concurrency();
            std::size_t work_per_thread = (arg.c_ + num_thread - 1) / num_thread;

            std::vector<joinable_thread> threads(num_thread);

            for(std::size_t it = 0; it < num_thread; ++it)
            {
                std::size_t ic_begin = it * work_per_thread;
                std::size_t ic_end = std::min(static_cast<int>((it + 1) * work_per_thread), arg.c_);

                auto f = [=] {
                    for(std::size_t ic = ic_begin; ic < ic_end; ++ic)
                    {
                        thread_reduce_func(ic);
                    }
                };

                threads[it] = joinable_thread(f);
            }

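            // The threads vector goes out of scope here; assuming joinable_thread joins
            // in its destructor (an RAII wrapper around std::thread), all per-channel
            // work has completed before Run returns.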
            return (0.0f);
        };

        float Run(const device::BaseArgument* p_arg,
                  const StreamConfig& /*stream_config*/ = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg));
        };
    };

    bool IsSupportedArgument(const device::BaseArgument* p_arg) override
    {
        (void)p_arg;

        return (true);
    };

    std::unique_ptr<device::BaseArgument>
    MakeArgumentPointer(const std::array<index_t, 4> xyLengths,
                        const std::array<index_t, 4> xStrides,
                        const std::array<index_t, 4> dyStrides,
                        const std::array<index_t, 4> dxStrides,
                        const std::array<int, 3> reduceDims,
                        const std::array<ck::index_t, 1> bnScaleBiasMeanVarLengths,
                        const std::array<ck::index_t, 1> bnScaleStrides,
                        const std::array<ck::index_t, 1> bnBiasStrides,
                        const std::array<ck::index_t, 1> bnMeanVarStrides,
                        const void* p_x,
                        const void* p_dy,
                        const void* p_scale,
                        const void* p_savedMean,
                        const void* p_savedInvVar,
                        double epsilon,
                        const DyElementwiseOp dy_elementwise_op,
                        void* p_dx,
                        void* p_dscale,
                        void* p_dbias) override
    {
        return std::make_unique<Argument>(xyLengths,
                                          xStrides,
                                          dyStrides,
                                          dxStrides,
                                          reduceDims,
                                          bnScaleBiasMeanVarLengths,
                                          bnScaleStrides,
                                          bnBiasStrides,
                                          bnMeanVarStrides,
                                          static_cast<const XDataType*>(p_x),
                                          static_cast<const DyDataType*>(p_dy),
                                          static_cast<const ScaleDataType*>(p_scale),
                                          static_cast<const MeanVarDataType*>(p_savedMean),
                                          static_cast<const MeanVarDataType*>(p_savedInvVar),
                                          epsilon,
                                          dy_elementwise_op,
                                          static_cast<DxDataType*>(p_dx),
                                          static_cast<ScaleDataType*>(p_dscale),
                                          static_cast<BiasDataType*>(p_dbias));
    };

    std::unique_ptr<device::BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "Reference_BatchNorm_Backward_NHWC_C<" << std::endl;
        // clang-format on

        return str.str();
    }
};

} // namespace host
} // namespace tensor_operation
} // namespace ck