xdlops_v4r4_fwd fp32/fp16 (#34)

* create files for xdlops * working on blockwise_gemm_xdlops * add KReduction * add m/n repeats * add 2x2 pipeline * added 128x128 wavegemm * use StaticBuffer of vector_type * break vector type to blk_size * add kpack into xdlops_gemm and blockwise_gemm * abroadcast only * add fp32 mfma instructions * adding fp16 mfma * pack half4_t * rename kperwave to kpack * add 32x32x8fp16 * add fp16 mfma * clean code * clean code * V4r4 xdlops kpack (#35) * add kpack with incorrect results * bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2 * add 1x1 kernel * add gridwise_gemm_v2 - single_buffer * enabled dwordx4 for fp16 Co-authored-by: Chao Liu <chao.liu2@amd.com> * refactor fwd-v4r4-xdlops * add v4r4-nhwc-xdlops * improve some perf of nhwc and nchw by tuning parameters, and change scheduling in gridwise-gemm loop * tweak scheduling in gridwise gemm * add v4r3 with a single output copy * init commit: output with slice win * adding sliceWin * add multiple repeats pattern * starting adding bwd-v4r1-xdlops * use tuple as SrcBuffer * adding bwd-data v4r1 nhwc xdlops * fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2() * fix bug in host bwd-data conv * initial implementation of bwd-data v4r1 nhwc xdlops * add launch bound flags * enable launch bound * add m/nrepeat=4 * tweak bwd-data v4r1 nhwc xdlops * added bwd-data v4r1 nhwc xdlops with output A and weight B * add fwd-v4r4 nhwc xdlops, A input, B weight, C output Co-authored-by: Chao Liu <chao.liu2@amd.com> [ROCm/composable_kernel commit: 3835318cc3]
2026-05-26 08:00:13 +00:00 · 2021-07-01 14:33:00 -05:00
parent 817b2a47c6
commit 67dcc552b6
54 changed files with 9813 additions and 245 deletions
--- a/driver/include/host_conv_bwd_data.hpp
+++ b/driver/include/host_conv_bwd_data.hpp
@@ -6,56 +6,62 @@ template <typename TIn,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
-          typename LeftPads,
-          typename RightPads>
-void host_direct_convolution_backward_data(Tensor<TIn>& in_nchw,
-                                           const Tensor<TWei>& wei_kcyx,
-                                           const Tensor<TOut>& out_nkhw,
-                                           ConvStrides,
-                                           ConvDilations,
-                                           LeftPads,
-                                           RightPads)
+          typename InLeftPads,
+          typename InRightPads>
+void host_direct_convolution_backward_data(Tensor<TIn>& in,
+                                           const Tensor<TWei>& wei,
+                                           const Tensor<TOut>& out,
+                                           const ConvStrides& conv_strides,
+                                           const ConvDilations& conv_dilations,
+                                           const InLeftPads& in_left_pads,
+                                           const InRightPads& in_right_pads,
+                                           const ConvTensorLayout layout = ConvTensorLayout::NCHW)
 {
    using namespace ck;

-    int N  = in_nchw.mDesc.GetLengths()[0];
-    int C  = in_nchw.mDesc.GetLengths()[1];
-    int HI = in_nchw.mDesc.GetLengths()[2];
-    int WI = in_nchw.mDesc.GetLengths()[3];
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};

-    std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
-    std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
-    std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
+    auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
+        std::size_t N  = in.mDesc.GetLengths()[I0];
+        std::size_t C  = in.mDesc.GetLengths()[I1];
+        std::size_t Hi = in.mDesc.GetLengths()[I2];
+        std::size_t Wi = in.mDesc.GetLengths()[I3];

-    std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
-    std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
+        std::size_t K = wei.mDesc.GetLengths()[I0];
+        std::size_t Y = wei.mDesc.GetLengths()[I2];
+        std::size_t X = wei.mDesc.GetLengths()[I3];
+
+        std::size_t Ho = out.mDesc.GetLengths()[I2];
+        std::size_t Wo = out.mDesc.GetLengths()[I3];

-    auto f = [&](auto n, auto c, auto hi, auto wi) {
        double v = 0;

        for(int y = 0; y < Y; ++y)
        {
-            int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];
+            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];

-            if(h_tmp % ConvStrides{}[0] == 0)
+            if(h_tmp % conv_strides[I0] == 0)
            {
-                int ho = h_tmp / ConvStrides{}[0];
+                int ho = h_tmp / conv_strides[I0];

-                if(ho >= 0 && ho < HO)
+                if(ho >= 0 && ho < Ho)
                {
                    for(int x = 0; x < X; ++x)
                    {
-                        int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
+                        int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];

-                        if(w_tmp % ConvStrides{}[1] == 0)
+                        if(w_tmp % conv_strides[I1] == 0)
                        {
-                            int wo = w_tmp / ConvStrides{}[1];
+                            int wo = w_tmp / conv_strides[I1];

-                            if(wo >= 0 && wo < WO)
+                            if(wo >= 0 && wo < Wo)
                            {
                                for(int k = 0; k < K; ++k)
                                {
-                                    v += out_nkhw(n, k, ho, wo) * wei_kcyx(k, c, y, x);
+                                    v += out(n, k, ho, wo) * wei(k, c, y, x);
                                }
                            }
                        }
@@ -64,14 +70,74 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in_nchw,
            }
        }

-        in_nchw(n, c, hi, wi) = v;
+        in(n, c, hi, wi) = v;
    };

-    auto f_par = make_ParallelTensorFunctor(f,
-                                            in_nchw.mDesc.GetLengths()[0],
-                                            in_nchw.mDesc.GetLengths()[1],
-                                            in_nchw.mDesc.GetLengths()[2],
-                                            in_nchw.mDesc.GetLengths()[3]);
+    auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
+        std::size_t N  = in.mDesc.GetLengths()[I0];
+        std::size_t Hi = in.mDesc.GetLengths()[I1];
+        std::size_t Wi = in.mDesc.GetLengths()[I2];
+        std::size_t C  = in.mDesc.GetLengths()[I3];

-    f_par(std::thread::hardware_concurrency());
+        std::size_t K = wei.mDesc.GetLengths()[I0];
+        std::size_t Y = wei.mDesc.GetLengths()[I1];
+        std::size_t X = wei.mDesc.GetLengths()[I2];
+
+        std::size_t Ho = out.mDesc.GetLengths()[I1];
+        std::size_t Wo = out.mDesc.GetLengths()[I2];
+
+        double v = 0;
+
+        for(int y = 0; y < Y; ++y)
+        {
+            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
+
+            if(h_tmp % conv_strides[I0] == 0)
+            {
+                int ho = h_tmp / conv_strides[I0];
+
+                if(ho >= 0 && ho < Ho)
+                {
+                    for(int x = 0; x < X; ++x)
+                    {
+                        int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
+
+                        if(w_tmp % conv_strides[I1] == 0)
+                        {
+                            int wo = w_tmp / conv_strides[I1];
+
+                            if(wo >= 0 && wo < Wo)
+                            {
+                                for(int k = 0; k < K; ++k)
+                                {
+                                    v += out(n, ho, wo, k) * wei(k, y, x, c);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        in(n, hi, wi, c) = v;
+    };
+
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        make_ParallelTensorFunctor(f_nchw,
+                                   in.mDesc.GetLengths()[0],
+                                   in.mDesc.GetLengths()[1],
+                                   in.mDesc.GetLengths()[2],
+                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    case ConvTensorLayout::NHWC:
+        make_ParallelTensorFunctor(f_nhwc,
+                                   in.mDesc.GetLengths()[0],
+                                   in.mDesc.GetLengths()[1],
+                                   in.mDesc.GetLengths()[2],
+                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    default: throw std::runtime_error("wrong! not supported layout");
+    }
 }