mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-12 01:10:17 +00:00)

Commit: experimenting with global and buffer load/store
@@ -225,7 +225,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
         // c_thread_mtx definition: this is a mess
         // TODO: more elegant way of defining c_thread_mtx
         constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
-            Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<N1 * N2>{});
+            Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<GemmNRepeat * GemmNPerThreadSubC>{});

         const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
             BlockSize,
@@ -224,14 +224,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer

         // c_thread_mtx definition: this is a mess
         // TODO: more elegant way of defining c_thread_mtx
-        constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
-            Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<GemmNRepeat * GemmMPerThreadSubC>{});
+        constexpr auto c_k0k1_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
+            Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<GemmNRepeat * GemmNPerThreadSubC>{});

         const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
             BlockSize,
             decltype(a_e_k_block_mtx_desc),
             decltype(b_e_n1bn2_block_mtx_desc),
-            decltype(c_k0k2_n1n2_thread_mtx_desc),
+            decltype(c_k0k1_n1n2_thread_mtx_desc),
             GemmMPerThreadSubC,
             GemmNPerThreadSubC,
             GemmMLevel0Cluster,
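For orientation, the renamed descriptor only spells out the per-thread C tile shape: GemmMRepeat * GemmMPerThreadSubC rows by GemmNRepeat * GemmNPerThreadSubC columns, packed in registers. A minimal host-side sketch of the size arithmetic (values are hypothetical, not taken from this commit):

#include <cstdio>

int main()
{
    // hypothetical values for illustration only
    constexpr int GemmMRepeat = 2, GemmMPerThreadSubC = 4;
    constexpr int GemmNRepeat = 2, GemmNPerThreadSubC = 4;

    constexpr int rows = GemmMRepeat * GemmMPerThreadSubC; // 8
    constexpr int cols = GemmNRepeat * GemmNPerThreadSubC; // 8
    std::printf("per-thread C tile: %d x %d = %d registers\n", rows, cols, rows * cols);
}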
@@ -258,12 +258,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         __shared__ Float p_wei_block_double[2 * wei_block_space];

         // register allocation for output
-        Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()];
+        Float p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()];

         // zero out threadwise output
-        threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);
-
-        const Float* p_wei_block_on_global = p_wei_global;
+        threadwise_matrix_set_zero(c_k0k1_n1n2_thread_mtx_desc, p_out_thread);

         // LDS double buffer: preload data into LDS
         {
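The 2 * wei_block_space allocation is the classic ping-pong buffer: one half is consumed by the blockwise GEMM while the other half is filled from global memory. A host-side analog of the pattern, with illustrative names only:

#include <algorithm>
#include <utility>
#include <vector>

void pipeline(const std::vector<float>& src, int tile_size, int num_tiles)
{
    std::vector<float> lds(2 * tile_size);   // stands in for the __shared__ array
    float* now  = lds.data();                // half being consumed
    float* next = lds.data() + tile_size;    // half being filled
    std::copy_n(src.data(), tile_size, now); // preload first tile
    for(int i = 1; i < num_tiles; ++i)
    {
        std::copy_n(src.data() + i * tile_size, tile_size, next); // load next tile
        // ... compute on `now` here ...
        std::swap(now, next); // flip buffers
    }
    // ... compute on the final tile in `now` ...
}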
@@ -294,14 +292,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

             blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
-            p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
+            blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

             __syncthreads();

             // LDS double buffer: load next data from device mem
             blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global,
-                                                     p_wei_register_buffer);
+            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
@@ -319,13 +316,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

                 blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
-                p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
+                blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

                 __syncthreads();

                 // LDS double buffer: load next data from device mem
                 blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-                blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer);
+                blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);

                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
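The replaced pointer bump only works when E has a plain stride; MoveSrcSliceWindow instead advances a multi-dimensional slice origin and re-derives the offset, which stays correct when E is a merged dimension. A reduced sketch of the idea (types and strides are illustrative):

struct Window2D
{
    int origin[2]; // (e, k) slice origin

    void move_src_slice_window(int de, int dk)
    {
        origin[0] += de;
        origin[1] += dk;
    }

    int offset(int stride_e, int stride_k) const
    {
        return origin[0] * stride_e + origin[1] * stride_k; // re-derived each time
    }
};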
@@ -347,6 +344,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer

         // copy output: register to global memory
         {
+#if 0
             constexpr index_t K2 = GemmMPerThreadSubC;
             constexpr index_t K1 = GemmMLevel0Cluster * GemmMLevel1Cluster;
@@ -392,17 +390,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                     k_thread_data_on_global, 0, b_thread_data_on_global, 0);

-#if 0
-            ThreadwiseGenericTensorSliceCopy_v1r2<
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
-                arithmetic_sequence_gen<0, 8, 1>::type,
-                7,
-                1,
-                1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
-                .Run(p_out_thread, p_out_thread_on_global);
-#elif 1
             ThreadwiseGenericTensorSliceCopy_v2r1<
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -414,6 +401,54 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 1,
                 1>({0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
                 .Run(p_out_thread, p_out_thread_on_global);
+#else
+            constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster;
+
+            // define tensor descriptor for threadwise copy
+            // output memory layout descriptor in register, src of threadwise copy
+            constexpr auto out_k0_k1_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor_packed(
+                Sequence<GemmMRepeat, GemmMPerThreadSubC, N1, 1, N2>{});
+
+            // output memory layout descriptor in device memory
+            constexpr auto out_n0_n1_n2_k0_k1_h_w_global_mem_desc =
+                out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}).Fold(I0, Number<N1>{}, Number<N2>{});
+
+            // output merged global tensor descriptor, dst of threadwise copy
+            constexpr auto out_k0_k1_n1_b_n2_global_merged_desc =
+                make_ConstantMergedTensorDescriptor(out_n0_n1_n2_k0_k1_h_w_global_mem_desc,
+                                                    Sequence<3>{},
+                                                    Sequence<4>{},
+                                                    Sequence<1>{},
+                                                    Sequence<0, 5, 6>{},
+                                                    Sequence<2>{});
+
+            // calculate origin of thread output tensor on global memory
+            // blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
+
+            const index_t k_thread_data_on_global =
+                k_block_data_on_global + c_thread_mtx_on_block.row;
+
+            const index_t b_thread_data_on_global =
+                b_block_data_on_global + c_thread_mtx_on_block.col / N2;
+
+            ThreadwiseGenericTensorSliceCopy_v2r1<
+                decltype(out_k0_k1_n1_b_n2_thread_mem_desc),
+                decltype(out_k0_k1_n1_b_n2_global_merged_desc),
+                decltype(out_k0_k1_n1_b_n2_thread_mem_desc.GetLengths()),
+                arithmetic_sequence_gen<0, 5, 1>::type,
+                arithmetic_sequence_gen<0, 5, 1>::type,
+                3,
+                3,
+                1,
+                1>({0, 0, 0, 0, 0},
+                   {k_thread_data_on_global / K1,
+                    k_thread_data_on_global % K1,
+                    0,
+                    b_thread_data_on_global,
+                    0})
+                .template Run_amd_experiment<Float, 0, 2>(p_out_thread, p_out_global);
 #endif
         }
     }
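The destination origin above splits the linear K coordinate into (k0, k1) with K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster, i.e. k == k0 * K1 + k1. A compile-time check of that round trip (values hypothetical):

int main()
{
    constexpr int K1 = 4 * 4 * 2; // hypothetical product of the three factors above
    constexpr int k  = 100;       // linear per-thread K coordinate
    constexpr int k0 = k / K1;
    constexpr int k1 = k % K1;
    static_assert(k0 * K1 + k1 == k, "fold/unfold must round-trip");
}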
@@ -313,14 +313,14 @@ struct TensorCoordinate
     private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
     }

     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }
@@ -188,7 +188,7 @@ struct TensorCoordinate_v2
     private:
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -196,7 +196,7 @@ struct TensorCoordinate_v2

     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -439,11 +439,10 @@ struct BlockwiseGenericTensorSliceCopy_v2
 {
     static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

-    using SrcCoordinate = typename TensorCoordinate<SrcDesc>::type;
-    using DstCoordinate = typename TensorCoordinate<DstDesc>::type;
+    using Index = MultiIndex<nDim>;

-    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
-                                                            DstCoordinate dst_block_slice_origin)
+    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin,
+                                                            const Index& dst_block_slice_origin)
     {
         static_assert(
             nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
@@ -485,13 +484,21 @@ struct BlockwiseGenericTensorSliceCopy_v2
     template <typename TData>
     __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
     {
+#if 0
         mThreadwiseLoad.Run(p_src, p_buffer);
+#else
+        mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
+#endif
     }

     template <typename TData>
     __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
     {
+#if 0
         mThreadwiseStore.Run(p_buffer, p_dst);
+#else
+        mThreadwiseStore.template Run_amd_experiment<TData, 0, 2>(p_buffer, p_dst);
+#endif
     }

     template <typename TData>
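Per the comment later in this commit, the two integer template arguments name memory spaces: 0 = VGPR, 1 = LDS, 2 = global. So the load path is global-to-register <2, 0> and the store path register-to-global <0, 2>. A small stand-alone sketch of dispatching on such compile-time tags:

#include <cstdio>

enum : int { Vgpr = 0, Lds = 1, Global = 2 }; // tag values from the commit's comment

template <int SrcSpace, int DstSpace>
void describe_copy()
{
    static const char* names[] = {"VGPR", "LDS", "global"};
    std::printf("copy: %s -> %s\n", names[SrcSpace], names[DstSpace]);
}

int main()
{
    describe_copy<Global, Vgpr>(); // RunLoadRegisterBuffer
    describe_copy<Vgpr, Global>(); // RunStoreRegisterBuffer
}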
@@ -499,8 +506,13 @@ struct BlockwiseGenericTensorSliceCopy_v2
     {
         TData p_buffer[GetRegisterBufferSize()];

+#if 0
         mThreadwiseLoad.Run(p_src, p_buffer);
         mThreadwiseStore.Run(p_buffer, p_dst);
+#else
+        mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
+        mThreadwiseStore.template Run_amd_experiment<TData, 0, 2>(p_buffer, p_dst);
+#endif
     }

     template <typename T, bool PositiveDirection>
@@ -452,11 +452,13 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
 {
     static constexpr index_t nDim = SliceLengths::GetSize();

+    using Index = MultiIndex<nDim>;
+
     using SrcCoordinate = typename TensorCoordinate<SrcDesc>::type;
     using DstCoordinate = typename TensorCoordinate<DstDesc>::type;

-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin,
-                                                               DstCoordinate dst_slice_origin)
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(const Index& src_slice_origin,
+                                                               const Index& dst_slice_origin)
         : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
     {
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&
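The constructor now takes plain multi-indices instead of coordinate objects, so callers can write brace-initialized origins like {0, 0, 0, 0} and the copier builds its own coordinates internally. A stripped-down illustration of the signature change (stand-in types, not the library's):

#include <array>

template <int nDim>
using Index = std::array<int, nDim>; // stands in for MultiIndex<nDim>

struct Copier
{
    Index<4> src_origin, dst_origin;

    // new-style constructor: plain indices in, coordinates built inside
    Copier(const Index<4>& src, const Index<4>& dst) : src_origin(src), dst_origin(dst) {}
};

int main() { Copier c({0, 0, 0, 0}, {1, 2, 0, 0}); }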
@@ -755,6 +757,211 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
         }
     }

+    // memory-space
+    // 0: VGPR
+    // 1: LDS
+    // 2: global-memory
+    template <class TData, index_t SrcMemorySpace, index_t DstMemorySpace>
+    __device__ void Run_amd_experiment(const TData* p_src, TData* p_dst) const
+    {
+        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
+
+        TData p_buffer_[buffer_desc.GetElementSpace()];
+        TData* p_buffer = p_buffer_;
+
+        // copy data from src into buffer
+        {
+            using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+
+            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
+            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};
+
+            constexpr auto src_access_lengths = SliceLengths::Modify(
+                src_vector_access_dim,
+                SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
+
+            // Offsets w.r.t. merged dimensions need to be calculated at run-time. Offsets w.r.t.
+            // normal dimensions are known at compile-time.
+            // Below is a hack to isolate merged dimension ids from normal dimension ids, so the
+            // corresponding offsets can be calculated separately at run-time and compile-time.
+            // src_merged_dim_access_lengths has the same value as src_access_lengths on src's
+            // merged dimensions, and has value = 1 on normal dimensions;
+            // src_normal_dim_access_lengths has the same value as src_access_lengths on src's
+            // normal dimensions, and has value = 1 on merged dimensions.
+            constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
+                nDim,
+                IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
+
+            constexpr auto src_normal_dim_access_lengths =
+                src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
+
+            ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
+                auto src_merged_dim_access_id) {
+
+                auto src_merged_dim_data_id = src_merged_dim_access_id;
+                src_merged_dim_data_id(src_vector_access_dim) =
+                    src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
+
+                // offset w.r.t. merged dimensions needs to be computed at run-time
+                const index_t src_merged_offset =
+                    (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
+
+                ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
+                    auto src_normal_dim_access_id) {
+
+                    auto src_normal_dim_data_id = src_normal_dim_access_id;
+                    src_normal_dim_data_id(src_vector_access_dim) =
+                        src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
+
+                    // offset w.r.t. normal dimensions is known at compile-time
+                    const index_t src_normal_offset =
+                        SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
+
+                    src_vector_t vector_data;
+
+                    static_if<SrcMemorySpace == 2>{}([&](auto) {
+#if 1 // source code
+                        // Load vector from src.
+                        // src can be any kind of memory-space.
+                        // In order for an optimized global_load to be emitted by the compiler,
+                        // need to assume:
+                        //   1. p_src is block-invariant (assumption)
+                        //   2. src_normal_offset is calculated at compile-time (guaranteed)
+                        //   3. src_merged_offset can be a runtime value (no assumption imposed)
+                        vector_data = *reinterpret_cast<const src_vector_t*>(
+                            &p_src[src_normal_offset + src_merged_offset]);
+#else // inline asm using buffer_load
+                        // Load vector from src.
+                        // src's memory-space can only be global-memory (buffer_load inline-asm
+                        // is used).
+                        // In order for buffer_load to be valid, need to assume:
+                        //   1. p_src is block-invariant (assumption)
+                        //   2. src_normal_offset is calculated at compile-time (guaranteed)
+                        //   3. src_merged_offset can be a runtime value (no assumption imposed)
+                        vector_data = buffer_load<TData, SrcDataPerAccess>(
+                            p_src,
+                            static_cast<uint32_t>(src_merged_offset),
+                            static_cast<uint32_t>(src_normal_offset));
+#endif
+                    }).Else([&](auto) {
+                        // Load vector from src.
+                        // src can be any kind of memory-space.
+                        // In order for an optimized global_load to be emitted by the compiler,
+                        // need to assume:
+                        //   1. p_src is block-invariant (assumption)
+                        //   2. src_normal_offset is calculated at compile-time (guaranteed)
+                        //   3. src_merged_offset can be a runtime value (no assumption imposed)
+                        vector_data = *reinterpret_cast<const src_vector_t*>(
+                            &p_src[src_normal_offset + src_merged_offset]);
+                    });
+
+                    // unpack vector into buffer
+                    for(index_t i = 0; i < SrcDataPerAccess; ++i)
+                    {
+                        auto scalar_id = make_zero_array<index_t, nDim>();
+                        scalar_id(src_vector_access_dim) = i;
+
+                        const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+                            src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
+
+                        p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
+                    }
+                });
+            });
+        }
+
+        // copy data from buffer into dst
+        {
+            using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
+            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};
+
+            constexpr auto dst_access_lengths = SliceLengths::Modify(
+                dst_vector_access_dim,
+                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
+
+            constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
+                nDim,
+                IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
+
+            constexpr auto dst_normal_dim_access_lengths =
+                dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
+
+            ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
+                [&](auto dst_merged_dim_access_id) {
+
+                    auto dst_merged_dim_data_id = dst_merged_dim_access_id;
+                    dst_merged_dim_data_id(dst_vector_access_dim) =
+                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+                    // offset w.r.t. merged dimensions needs to be computed at run-time
+                    const index_t dst_merged_offset =
+                        (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
+
+                    ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
+                        auto dst_normal_dim_access_id) {
+
+                        auto dst_normal_dim_data_id = dst_normal_dim_access_id;
+                        dst_normal_dim_data_id(dst_vector_access_dim) =
+                            dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+                        dst_vector_t vector_data;
+
+                        // pack vector from buffer
+                        for(index_t i = 0; i < DstDataPerAccess; ++i)
+                        {
+                            auto scalar_id = make_zero_array<index_t, nDim>();
+                            scalar_id(dst_vector_access_dim) = i;
+
+                            const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+                                dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
+
+                            reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+                        }
+
+                        // offset w.r.t. normal dimensions is known at compile-time
+                        const index_t dst_normal_offset =
+                            DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+
+                        static_if<DstMemorySpace == 2>{}([&](auto) {
+#if 1 // source code
+                            // Write vector into dst.
+                            // dst can be any kind of memory-space.
+                            // In order for an optimized global_store to be emitted by the
+                            // compiler, need to assume:
+                            //   1. p_dst is block-invariant (assumption)
+                            //   2. dst_normal_offset is calculated at compile-time (guaranteed)
+                            //   3. dst_merged_offset can be a runtime value (no assumption imposed)
+                            *reinterpret_cast<dst_vector_t*>(
+                                &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
+#else // inline asm using buffer_store
+                            // Write vector into dst.
+                            // dst's memory-space needs to be global-memory (buffer_store is used).
+                            // In order for buffer_store to be valid, need to assume:
+                            //   1. p_dst is block-invariant (assumption)
+                            //   2. dst_normal_offset is calculated at compile-time (guaranteed)
+                            //   3. dst_merged_offset can be a runtime value (no assumption imposed)
+                            buffer_store<TData, DstDataPerAccess>(
+                                vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+#endif
+                        }).Else([&](auto) {
+                            // Write vector into dst.
+                            // dst can be any kind of memory-space.
+                            // In order for an optimized global_store to be emitted by the
+                            // compiler, need to assume:
+                            //   1. p_dst is block-invariant (assumption)
+                            //   2. dst_normal_offset is calculated at compile-time (guaranteed)
+                            //   3. dst_merged_offset can be a runtime value (no assumption imposed)
+                            *reinterpret_cast<dst_vector_t*>(
+                                &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
+                        });
+                    });
+                });
+        }
+    }

     // T can be Sequence or Array
     template <class T, bool PositiveDirection>
     __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
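The core trick of Run_amd_experiment is to split every address into a compile-time part (normal dimensions, strides statically known) plus a run-time part (merged dimensions). A host-side sketch of that decomposition, with made-up strides:

#include <cstdio>

int main()
{
    // compile-time part: normal dimension, stride known statically
    constexpr int normal_stride = 16;
    constexpr int normal_id     = 3;
    constexpr int normal_offset = normal_id * normal_stride; // folded by the compiler

    // run-time part: merged dimension, resolved per iteration
    int merged_id[2]     = {2, 5};
    int merged_stride[2] = {64, 1};
    int merged_offset    = 0;
    for(int d = 0; d < 2; ++d)
        merged_offset += merged_id[d] * merged_stride[d];

    std::printf("total offset = %d\n", normal_offset + merged_offset); // 48 + 133 = 181
}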
@@ -8,6 +8,114 @@ namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);

+// buffer_load and buffer_store
+template <typename T, index_t VectorSize>
+__device__ typename vector_type<T, VectorSize>::MemoryType
+buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+
+template <typename T, index_t VectorSize>
+__device__ void buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
+                             T* p_dst_block,
+                             uint32_t dst_thread_offset,
+                             uint32_t dst_const_offset);
+
+template <>
+__device__ float buffer_load<float, 1>(const float* p_src_block,
+                                       uint32_t src_thread_offset,
+                                       uint32_t src_const_offset)
+{
+    float dst;
+
+    int32x4_t src_block_setting{0};
+    // fill in dwords 0 - 1: base address
+    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
+    // fill in dword 2: num_records
+    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
+    // fill in dword 3: format/config
+    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
+
+    asm volatile("\n \
+    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
+
+    return dst;
+}
+
+template <>
+__device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float* p_src_block,
+                                                                   uint32_t src_thread_offset,
+                                                                   uint32_t src_const_offset)
+{
+    vector_type<float, 2>::MemoryType dst;
+
+    int32x4_t src_block_setting{0};
+    // fill in dwords 0 - 1: base address
+    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
+    // fill in dword 2: num_records
+    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
+    // fill in dword 3: format/config
+    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
+
+    asm volatile("\n \
+    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
+
+    return dst;
+}
+
+template <>
+__device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float* p_src_block,
+                                                                   uint32_t src_thread_offset,
+                                                                   uint32_t src_const_offset)
+{
+    vector_type<float, 4>::MemoryType dst;
+
+    int32x4_t src_block_setting{0};
+    // fill in dwords 0 - 1: base address
+    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
+    // fill in dword 2: num_records
+    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
+    // fill in dword 3: format/config
+    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
+
+    asm volatile("\n \
+    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
+
+    return dst;
+}
+
+template <>
+__device__ void buffer_store<float, 1>(const float& src,
+                                       float* p_dst_block,
+                                       uint32_t dst_thread_offset,
+                                       uint32_t dst_const_offset)
+{
+    int32x4_t dst_block_setting{0};
+    // fill in dwords 0 - 1: base address
+    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
+    // fill in dword 2: num_records
+    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
+    // fill in dword 3: format/config
+    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
+
+    asm volatile("\n \
+    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 :
+                 : "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset));
+}
+
 __device__ void vmcnt(index_t cnt)
 {
     if(cnt == 0)
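The int32x4_t being assembled above is AMD's 128-bit buffer resource descriptor: dwords 0-1 hold the base address, dword 2 the num_records range check (-1 disables it), and dword 3 the format/config word (0x00027000 in this commit). A host-side mock of the same layout, for illustration only:

#include <cstdint>

struct BufferResource
{
    uint64_t base;        // dwords 0-1: base address of the buffer
    uint32_t num_records; // dword 2: range limit; 0xFFFFFFFF disables checking
    uint32_t config;      // dword 3: data format / stride bits
};
static_assert(sizeof(BufferResource) == 16, "must match int32x4_t");

int main()
{
    static float data[16];
    BufferResource rsrc{};
    rsrc.base        = reinterpret_cast<uint64_t>(&data[0]);
    rsrc.num_records = 0xFFFFFFFFu; // same as writing -1 into dword 2
    rsrc.config      = 0x00027000u; // value taken from the inline asm above
}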
@@ -6,7 +6,7 @@
 namespace ck {

 template <index_t NSize>
-__host__ __device__ void print_array(const char* s, Array<unsigned_t, NSize> a)
+__host__ __device__ void print_array(const char* s, Array<uint32_t, NSize> a)
 {
     constexpr index_t nsize = a.GetSize();
@@ -90,7 +90,7 @@ __host__ __device__ void print_array(const char* s, Array<unsigned_t, NSize> a)
 }

 template <index_t NSize>
-__host__ __device__ void print_array(const char* s, Array<signed_t, NSize> a)
+__host__ __device__ void print_array(const char* s, Array<int32_t, NSize> a)
 {
     constexpr index_t nsize = a.GetSize();
@@ -5,6 +5,7 @@
 #include "hip/hip_fp16.h"

 #define CK_DEVICE_BACKEND_AMD 1
+#define CK_USE_UNSIGNED_INDEX_TYPE 1
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
@@ -13,13 +14,10 @@

 namespace ck {

-using unsigned_t = uint32_t;
-using signed_t = int;
-
-#if 0 // debug
-using index_t = unsigned_t;
+#if CK_USE_UNSIGNED_INDEX_TYPE
+using index_t = uint32_t;
 #else
-using index_t = signed_t;
+using index_t = int32_t;
 #endif

 // For some reason, the HIP compiler needs this definition to generate optimal load and store
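Making the index signedness a build switch matters for codegen: signed int32_t overflow is undefined behavior, so the compiler may assume expressions like i * stride + j never wrap and fold them into addressing modes; uint32_t wraparound is defined and blocks some of that folding. A minimal pair for comparison:

#include <cstdint>

// signed index: overflow is UB, so the multiply-add may fold into addressing
float load_signed(const float* p, int32_t i) { return p[i * 4 + 3]; }

// unsigned index: wraparound is defined, so folding must preserve it
float load_unsigned(const float* p, uint32_t i) { return p[i * 4u + 3u]; }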
@@ -27,6 +25,8 @@ using index_t = signed_t;
 typedef float float2_t __attribute__((ext_vector_type(2)));
 typedef float float4_t __attribute__((ext_vector_type(4)));

+typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
+
 template <class T>
 __device__ void fused_multiply_accumulate(T& d, const T& s0, const T& s1)
 {
@@ -7,6 +7,7 @@
 #include "helper_cuda.h"

 #define CK_DEVICE_BACKEND_NVIDIA 1
+#define CK_USE_UNSIGNED_INDEX_TYPE 0
 #define CK_USE_AMD_INLINE_ASM 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
@@ -15,13 +16,10 @@

 namespace ck {

-using unsigned_t = uint32_t;
-using signed_t = int;
-
-#if 0 // debug
-using index_t = unsigned_t;
+#if CK_USE_UNSIGNED_INDEX_TYPE
+using index_t = uint32_t;
 #else
-using index_t = signed_t;
+using index_t = int32_t;
 #endif

 // For some reason, CUDA needs this definition, otherwise
@@ -48,7 +48,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

 #if 1
-    // each thread hold 64 data
+    // BlockSize = 256, blockwise-GEMM 128x128, each thread holds 64 data
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 16;
@@ -82,10 +82,47 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #elif 0
-    // each thread hold 32 data
+    // BlockSize = 64, blockwise-GEMM 64x64, each thread holds 64 data
+    constexpr index_t BlockSize = 64;
+
+    constexpr index_t BPerBlock = 8;
+    constexpr index_t KPerBlock = 64;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmNRepeat = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA = 4;
+    constexpr index_t GemmDataPerReadB = 4;
+
+    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 2, 1, 4>;
+    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 1, 8, 1>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>;            // [E, B, N1, N2]
+    using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>;            // [E, N1, B, N2]
+
+    constexpr index_t InBlockCopySrcDataPerRead_B = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    using WeiBlockCopySubLengths_E_K = Sequence<4, 2>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<2, 32>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder = Sequence<1, 0>;            // [K, E]
+    using WeiBlockCopyDstAccessOrder = Sequence<0, 1>;            // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+#elif 1
+    // BlockSize = 256, blockwise-GEMM 64x128, each thread holds 32 data
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 16;
@@ -107,7 +144,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 1, 4>;
     using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 16, 1>;
     using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
-    using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>;            // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>;            // [E, B, N1, N2]
     using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>;            // [E, N1, B, N2]

     constexpr index_t InBlockCopySrcDataPerRead_B = 1;
@@ -119,7 +156,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E = 2;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #endif
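Dropping WeiBlockCopySrcDataPerRead_E from 2 (or 4) to 1 trades vector loads for scalar ones: a per-read width of 4 floats becomes a single 16-byte load but requires contiguity and alignment along E, while width 1 always works. A sketch of the two access shapes (hypothetical helpers, fixed width of 4 for illustration):

#include <cstring>

// width 4: one 16-byte vector load per access (needs alignment + contiguity)
void read_e4(const float* src, float* dst) { std::memcpy(dst, src, 4 * sizeof(float)); }

// width 1: four scalar loads cover the same data, no alignment requirement
void read_e1(const float* src, float* dst)
{
    for(int i = 0; i < 4; ++i)
        dst[i] = src[i];
}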
@@ -133,50 +170,50 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,

     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

-    constexpr auto gridwise_conv =
-#if 0
-        GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
-#else
-        GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
-#endif
-        <GridSize,
-         BlockSize,
-         T,
-         decltype(in_nchw_desc),
-         decltype(wei_kcyx_desc),
-         decltype(out_nkhw_desc),
-         ConvStrides,
-         ConvDilations,
-         BPerBlock,
-         KPerBlock,
-         EPerBlock,
-         GemmNRepeat,
-         GemmMPerThreadSubC,
-         GemmNPerThreadSubC,
-         GemmMLevel0Cluster,
-         GemmNLevel0Cluster,
-         GemmMLevel1Cluster,
-         GemmNLevel1Cluster,
-         GemmKPerThreadLoop,
-         GemmDataPerReadA,
-         GemmDataPerReadB,
-         InBlockCopySubLengths_E_N1_B_N2,
-         InBlockCopyClusterLengths_E_N1_B_N2,
-         InBlockCopyThreadClusterArrangeOrder,
-         InBlockCopySrcAccessOrder,
-         InBlockCopyDstAccessOrder,
-         InBlockCopySrcDataPerRead_B,
-         InBlockCopyDstDataPerWrite_N2,
-         WeiBlockCopySubLengths_E_K,
-         WeiBlockCopyClusterLengths_E_K,
-         WeiBlockCopyThreadClusterArrangeOrder,
-         WeiBlockCopySrcAccessOrder,
-         WeiBlockCopyDstAccessOrder,
-         WeiBlockCopySrcDataPerRead_E,
-         WeiBlockCopyDstDataPerWrite_K>{};
-
     for(index_t i = 0; i < nrepeat; ++i)
     {
+        constexpr auto gridwise_conv =
+#if 0
+            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
+#else
+            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
+#endif
+            <GridSize,
+             BlockSize,
+             T,
+             decltype(in_nchw_desc),
+             decltype(wei_kcyx_desc),
+             decltype(out_nkhw_desc),
+             ConvStrides,
+             ConvDilations,
+             BPerBlock,
+             KPerBlock,
+             EPerBlock,
+             GemmNRepeat,
+             GemmMPerThreadSubC,
+             GemmNPerThreadSubC,
+             GemmMLevel0Cluster,
+             GemmNLevel0Cluster,
+             GemmMLevel1Cluster,
+             GemmNLevel1Cluster,
+             GemmKPerThreadLoop,
+             GemmDataPerReadA,
+             GemmDataPerReadB,
+             InBlockCopySubLengths_E_N1_B_N2,
+             InBlockCopyClusterLengths_E_N1_B_N2,
+             InBlockCopyThreadClusterArrangeOrder,
+             InBlockCopySrcAccessOrder,
+             InBlockCopyDstAccessOrder,
+             InBlockCopySrcDataPerRead_B,
+             InBlockCopyDstDataPerWrite_N2,
+             WeiBlockCopySubLengths_E_K,
+             WeiBlockCopyClusterLengths_E_K,
+             WeiBlockCopyThreadClusterArrangeOrder,
+             WeiBlockCopySrcAccessOrder,
+             WeiBlockCopyDstAccessOrder,
+             WeiBlockCopySrcDataPerRead_E,
+             WeiBlockCopyDstDataPerWrite_K>{};
+
         float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                    dim3(GridSize),
                                    dim3(BlockSize),
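The surrounding loop re-launches the kernel nrepeat times and times each launch. A host-side stand-in for that measurement loop (launch_kernel and the event timer are replaced by a steady_clock sketch):

#include <chrono>
#include <cstdio>

template <typename F>
float time_once_ms(F&& launch)
{
    auto t0 = std::chrono::steady_clock::now();
    launch(); // stands in for launch_kernel(...)
    auto t1 = std::chrono::steady_clock::now();
    return std::chrono::duration<float, std::milli>(t1 - t0).count();
}

int main()
{
    const int nrepeat = 5;
    for(int i = 0; i < nrepeat; ++i)
        std::printf("run %d: %f ms\n", i, time_once_ms([] { /* kernel here */ }));
}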
@@ -75,13 +75,13 @@ int main(int argc, char* argv[])
     using namespace ck;

 #if 0
-    constexpr index_t N = 8;
-    constexpr index_t C = 8;
-    constexpr index_t HI = 2;
-    constexpr index_t WI = 8;
+    constexpr index_t N = 64;
+    constexpr index_t C = 16;
+    constexpr index_t HI = 34;
+    constexpr index_t WI = 34;
     constexpr index_t K = 128;
-    constexpr index_t Y = 1;
-    constexpr index_t X = 1;
+    constexpr index_t Y = 3;
+    constexpr index_t X = 3;

     using ConvStrides = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;
@@ -92,8 +92,8 @@ int main(int argc, char* argv[])
     // 3x3, 34x34
     constexpr index_t N = 64;
     constexpr index_t C = 256;
-    constexpr index_t HI = 32;
-    constexpr index_t WI = 32;
+    constexpr index_t HI = 34;
+    constexpr index_t WI = 34;
     constexpr index_t K = 128;
     constexpr index_t Y = 3;
     constexpr index_t X = 3;
@@ -101,8 +101,8 @@ int main(int argc, char* argv[])
     using ConvStrides = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;

-    using LeftPads = Sequence<1, 1>;
-    using RightPads = Sequence<1, 1>;
+    using LeftPads = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
 #elif 0
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
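The padding change is consistent with the image-size change: with HO = (HI + pad_l + pad_r - dilation*(Y-1) - 1)/stride + 1, a 3x3 filter gives the same 32x32 output for HI=32 with pads of 1 and for HI=34 with pads of 0. A quick check:

#include <cstdio>

int conv_out(int hi, int pad_l, int pad_r, int y, int stride, int dilation)
{
    return (hi + pad_l + pad_r - dilation * (y - 1) - 1) / stride + 1;
}

int main()
{
    std::printf("%d %d\n", conv_out(32, 1, 1, 3, 1, 1), conv_out(34, 0, 0, 3, 1, 1)); // 32 32
}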
@@ -434,7 +434,7 @@ int main(int argc, char* argv[])
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,