Restructure gridwise and blockwise GEMM, add tensor contraction and FWD-v4r5 (#36)

* experimenting magic number division * overhauling fwd-v4r4 to clearly reflect transformation graph * added fwd-v4r5 * bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2 * bug fix and added sanity-check in transform_dynamic_tensor_descriptor * added conv_driver_v2 [ROCm/composable_kernel commit: 30072aec37]
2026-05-24 14:54:47 +00:00 · 2021-06-09 23:53:08 -05:00
parent 040023fdcd
commit f4acec502e
38 changed files with 4791 additions and 2050 deletions
--- a/driver/include/host_conv.hpp
+++ b/driver/include/host_conv.hpp
@@ -6,58 +6,94 @@ template <class TIn,
          class TOut,
          class ConvStrides,
          class ConvDilations,
-          class LowerPads,
-          class UpperPads>
-void host_direct_convolution(const Tensor<TIn>& in_nchw,
-                             const Tensor<TWei>& wei_kcyx,
-                             Tensor<TOut>& out_nkhw,
-                             ConvStrides,
-                             ConvDilations,
-                             LowerPads,
-                             UpperPads)
+          class InLeftPads,
+          class InRightPads>
+void host_direct_convolution(const Tensor<TIn>& in,
+                             const Tensor<TWei>& wei,
+                             Tensor<TOut>& out,
+                             const ConvStrides& conv_strides,
+                             const ConvDilations& conv_dilations,
+                             const InLeftPads& in_left_pads,
+                             const InRightPads& in_right_pads,
+                             const ConvTensorLayout layout = ConvTensorLayout::NCHW)
 {
    using namespace ck;

-    index_t h_pad_low = LowerPads{}.Get(Number<0>{});
-    index_t w_pad_low = LowerPads{}.Get(Number<1>{});
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};

-    auto f = [&](auto n, auto k, auto ho, auto wo) {
+    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
        double v = 0;
-        for(int c = 0; c < wei_kcyx.mDesc.GetLengths()[1]; ++c)
+        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
        {
-            for(int y = 0; y < wei_kcyx.mDesc.GetLengths()[2]; ++y)
+            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
            {
-                int hi = ho * ConvStrides{}[0] + y * ConvDilations{}[0] - h_pad_low;
-                for(int x = 0; x < wei_kcyx.mDesc.GetLengths()[3]; ++x)
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                {
-                    int wi = wo * ConvStrides{}[1] + x * ConvDilations{}[1] - w_pad_low;
-                    if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in_nchw.mDesc.GetLengths()[3])
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[3])
                    {
-                        v += static_cast<const double>(in_nchw(n, c, hi, wi)) *
-                             static_cast<const double>(wei_kcyx(k, c, y, x));
+                        v += static_cast<const double>(in(n, c, hi, wi)) *
+                             static_cast<const double>(wei(k, c, y, x));
                    }
                }
            }
        }
-        out_nkhw(n, k, ho, wo) = v;
+        out(n, k, ho, wo) = v;
    };

-    auto f_par = make_ParallelTensorFunctor(f,
-                                            out_nkhw.mDesc.GetLengths()[0],
-                                            out_nkhw.mDesc.GetLengths()[1],
-                                            out_nkhw.mDesc.GetLengths()[2],
-                                            out_nkhw.mDesc.GetLengths()[3]);
+    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
+        double v = 0;
+        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[2])
+                    {
+                        v += static_cast<const double>(in(n, hi, wi, c)) *
+                             static_cast<const double>(wei(k, y, x, c));
+                    }
+                }
+            }
+        }
+        out(n, ho, wo, k) = v;
+    };

-    f_par(std::thread::hardware_concurrency());
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        make_ParallelTensorFunctor(f_nchw,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    case ConvTensorLayout::NHWC:
+        make_ParallelTensorFunctor(f_nhwc,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    default: throw std::runtime_error("wrong! not supported layout");
+    }
 }

-template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
+template <class TIn, class TWei, class TOut, class InLeftPads, class InRightPads>
 void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                                   const Tensor<TWei>& wei_kcyx,
                                   Tensor<TOut>& out_nkhw,
-                                   LowerPads,
-                                   UpperPads)
+                                   InLeftPads,
+                                   InRightPads)
 {
    using namespace ck;

@@ -76,8 +112,8 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
    std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
    std::size_t WO = out_nkhw.mDesc.GetLengths()[3];

-    index_t h_pad_low = LowerPads{}.Get(Number<0>{});
-    index_t w_pad_low = LowerPads{}.Get(Number<1>{});
+    index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
+    index_t w_pad_low = InLeftPads{}.Get(Number<1>{});

    std::size_t HiPerTile = HoPerTile + Y - 1;
    std::size_t WiPerTile = WoPerTile + X - 1;