refactor

[ROCm/composable_kernel commit: f7be86b9e4]
2026-05-14 18:17:44 +00:00 · 2019-09-16 22:47:55 -05:00
parent ff99bc281e
commit 2b26fb76ca
6 changed files with 110 additions and 87 deletions
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -158,24 +158,20 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
        //     slice a merged tensor, reorder and copy to a normal tensor
        //     this copy operator already has blockwise offset built-in
        auto blockwise_in_copy =
-#if 0
-            BlockwiseGenericTensorSliceCopy_v1
-#else
-            BlockwiseGenericTensorSliceCopy_v2
-#endif
-            <BlockSize,
-             decltype(in_e_n1_b_n2_global_merged_desc),
-             decltype(in_e_n1_b_n2_block_desc),
-             decltype(in_e_n1_b_n2_block_desc.GetLengths()),
-             InBlockCopySubLengths_E_N1_B_N2,
-             InBlockCopyClusterLengths_E_N1_B_N2,
-             InBlockCopyThreadClusterArrangeOrder,
-             InBlockCopySrcAccessOrder,
-             InBlockCopyDstAccessOrder,
-             2,
-             3,
-             InBlockCopySrcDataPerRead_B,
-             InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+            BlockwiseGenericTensorSliceCopy_v2<BlockSize,
+                                               decltype(in_e_n1_b_n2_global_merged_desc),
+                                               decltype(in_e_n1_b_n2_block_desc),
+                                               decltype(in_e_n1_b_n2_block_desc.GetLengths()),
+                                               InBlockCopySubLengths_E_N1_B_N2,
+                                               InBlockCopyClusterLengths_E_N1_B_N2,
+                                               InBlockCopyThreadClusterArrangeOrder,
+                                               InBlockCopySrcAccessOrder,
+                                               InBlockCopyDstAccessOrder,
+                                               2,
+                                               3,
+                                               InBlockCopySrcDataPerRead_B,
+                                               InBlockCopyDstDataPerWrite_N2>(
+                {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});

        // weight tensor
        //     tensor descriptor in device memory, src of blockwise copy
@@ -192,24 +188,20 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
        //     slice a tensor, and copy it into another tensor
        //     this copy operator already have blockwise offset built-in
        auto blockwise_wei_copy =
-#if 0
-            BlockwiseGenericTensorSliceCopy_v1
-#else
-            BlockwiseGenericTensorSliceCopy_v2
-#endif
-            <BlockSize,
-             decltype(wei_e_k_global_desc),
-             decltype(wei_e_k_block_desc),
-             decltype(wei_e_k_block_desc.GetLengths()),
-             WeiBlockCopySubLengths_E_K,
-             WeiBlockCopyClusterLengths_E_K,
-             WeiBlockCopyThreadClusterArrangeOrder,
-             WeiBlockCopySrcAccessOrder,
-             WeiBlockCopyDstAccessOrder,
-             0,
-             1,
-             WeiBlockCopySrcDataPerRead_E,
-             WeiBlockCopyDstDataPerWrite_K>({0, k_block_data_on_global}, {0, 0});
+            BlockwiseGenericTensorSliceCopy_v2<BlockSize,
+                                               decltype(wei_e_k_global_desc),
+                                               decltype(wei_e_k_block_desc),
+                                               decltype(wei_e_k_block_desc.GetLengths()),
+                                               WeiBlockCopySubLengths_E_K,
+                                               WeiBlockCopyClusterLengths_E_K,
+                                               WeiBlockCopyThreadClusterArrangeOrder,
+                                               WeiBlockCopySrcAccessOrder,
+                                               WeiBlockCopyDstAccessOrder,
+                                               0,
+                                               1,
+                                               WeiBlockCopySrcDataPerRead_E,
+                                               WeiBlockCopyDstDataPerWrite_K>(
+                {0, k_block_data_on_global}, {0, 0});

        // GEMM definition
        // c_mtx += transpose(a_mtx) * b_mtx
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
@@ -51,7 +51,7 @@ template <index_t GridSize,
          index_t WeiBlockCopyDstDataPerWrite_K>
 struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
 {
-#if 1
+#if 0
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
                        Float* const __restrict__ p_out_global) const
@@ -437,6 +437,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
                      "wrong! aligment requirement for vectorized global load of input tensor will "
                      "be violated");

+        // input
        constexpr auto in_n_c_hi_wi_global_desc =
            make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());

@@ -465,6 +466,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
            make_tuple(Sequence<3, 4, 6>{}, Sequence<1>{}, Sequence<0, 5, 7>{}, Sequence<2>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

+        // weight
+        constexpr auto wei_e_k_global_desc =
+            transform_tensor_descriptor(wei_k_c_y_x_global_desc,
+                                        make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
+                                        make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
 #if 0
        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
        {
@@ -487,8 +495,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
            print_array("idx1: ", idx1);
            print_array("idx0: ", idx0);
        }
+#else
+        index_t itmp    = get_block_1d_id() + get_thread_local_1d_id();
+        auto wei_coord1 = make_tensor_coordinate_v2(wei_e_k_global_desc, {itmp, itmp + 1});
+
+        auto step_sizes = make_multi_index(EPerBlock, 0);
+
+        wei_coord1 += step_sizes;
+
+        p_out_global[0] = wei_coord1.GetLowerCoordinate().GetIndex()[0];
+        p_out_global[1] = wei_coord1.GetLowerCoordinate().GetIndex()[1];
+        p_out_global[2] = wei_coord1.GetLowerCoordinate().GetIndex()[2];
+        p_out_global[3] = wei_coord1.GetLowerCoordinate().GetIndex()[3];
 #endif
-        p_out_global[0] = in_e_n1_b_n2_global_desc.CalculateOffset({0, 0, 10, 0});
    }
 #endif
 };
--- a/composable_kernel/include/tensor_description/multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp
@@ -197,7 +197,7 @@ struct Merge

            // do carry check in reversed order, starting from lowest dimension
            // don't check the highest dimension
-            static_for<0, nDimLow, 1>{}([&](auto ireverse) {
+            static_for<0, nDimLow - 1, 1>{}([&](auto ireverse) {
                constexpr index_t i = nDimLow - 1 - ireverse;

                if(carry)
@@ -213,6 +213,12 @@ struct Merge
                    carry = true;
                }
            });
+
+            // highest dimension, no out-of-bound check
+            if(carry)
+            {
+                ++idx_low_new(0);
+            }
        }
        else if(idx_up_diff[0] < 0)
        {
@@ -220,7 +226,7 @@ struct Merge

            // do borrow check in reversed order, starting from lowest dimension
            // don't check the highest dimension
-            static_for<0, nDimLow, 1>{}([&](auto ireverse) {
+            static_for<0, nDimLow - 1, 1>{}([&](auto ireverse) {
                constexpr index_t i = nDimLow - 1 - ireverse;

                if(borrow)
@@ -236,6 +242,12 @@ struct Merge
                    borrow = true;
                }
            });
+
+            // highest dimension, no out-of-bound check
+            if(borrow)
+            {
+                --idx_low_new(0);
+            }
        }

        return idx_low_new - idx_low_old;