From 00899f191b1ffbe24bdaf982d8ebf6a4e29697c3 Mon Sep 17 00:00:00 2001
From: Chao Liu <lc.roy86@gmail.com>
Date: Sat, 13 Apr 2019 11:19:17 -0500
Subject: [PATCH] implicit gemm v1r2: only load 1d filter

---
 ...icit_gemm_convolution_1_chwn_cyxk_khwn.hpp |  48 ++++-
 driver/driver.hip.cpp                         |  43 +++--
 src/include/ConstantTensorDescriptor.hip.hpp  |  32 ++++
 src/include/blockwise_2d_tensor_op.hip.hpp    | 101 +++++++++-
 src/include/blockwise_3d_tensor_op.hip.hpp    | 103 ++++++++++
 src/include/blockwise_4d_tensor_op.hip.hpp    |  18 +-
 src/include/blockwise_batched_gemm.hip.hpp    |  11 ++
 .../blockwise_direct_convolution.hip.hpp      |   4 +-
 src/include/blockwise_gemm.hip.hpp            |   2 +
 ...implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp} |  14 +-
 ..._chwn_cyxk_khwn_lds_double_buffer.hip.hpp} |   2 +-
 ..._implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp | 176 +++++++++---------
 ...irect_convolution_2_nchw_kcyx_nkhw.hip.hpp |   2 +-
 ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp |   2 +-
 src/include/threadwise_2d_tensor_op.hip.hpp   |   4 +-
 src/include/threadwise_4d_tensor_op.hip.hpp   |   4 +-
 .../threadwise_direct_convolution.hip.hpp     |   2 +-
 17 files changed, 426 insertions(+), 142 deletions(-)
 create mode 100644 src/include/blockwise_3d_tensor_op.hip.hpp
 rename src/include/{gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp => gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp} (96%)
 rename src/include/{gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp => gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp} (99%)
diff --git a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp
index decf294ab4..8727f7a315 100644
--- a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp
+++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp
@@ -2,8 +2,8 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "gridwise_convolution_wrapper.hip.hpp"
-#include "gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp"
-#include "gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp"
 
 template <class T, class InDesc, class WeiDesc, class OutDesc>
@@ -78,7 +78,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
     out_khwn_device_buf.ToDevice(out_khwn.mData.data());
 
 #if 0
-    // for 3x3, 34x34, Pascal
+    // for 3x3, 34x34, v1r1, Pascal
     constexpr index_t NPerBlock  = 16;
     constexpr index_t KPerBlock  = 64;
     constexpr index_t CPerBlock  = 4;
@@ -112,6 +112,40 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
 
     constexpr index_t BlockSize = 128;
 #elif 1
+    // for 3x3, 34x34, v1r2, Pascal
+    constexpr index_t NPerBlock  = 4;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 8;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t InBlockCopy_ThreadPerDimC = 4;
+    constexpr index_t InBlockCopy_ThreadPerDimH = 4;
+    constexpr index_t InBlockCopy_ThreadPerDimW = 2;
+    constexpr index_t InBlockCopy_ThreadPerDimN = 1;
+    constexpr index_t InBlockCopyDataPerRead    = 4;
+
+    constexpr index_t WeiBlockCopyDataPerRead = 4;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite = 2;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
     // for 3x3, 34x34, Vega 20
     constexpr index_t NPerBlock  = 16;
     constexpr index_t KPerBlock  = 128;
@@ -406,12 +440,12 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
         constexpr auto gridwise_conv =
-#if 1
-            GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn
+#if 0
+            GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
+#elif 0
+            GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn_lds_double_buffer
 #elif 1
             GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
-#elif 0
-            GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn_lds_double_buffer
 #endif
             <GridSize,
              BlockSize,
diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp
index 54b8cb1982..2eb690d765 100644
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -35,6 +35,29 @@ struct GeneratorTensor_2
     }
 };
 
+struct GeneratorTensor_3
+{
+    int min_value = 0;
+    int max_value = 9;
+
+    template <class... Is>
+    double operator()(Is... is)
+    {
+        std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
+
+#if 0
+        auto f_acc = std::plus<index_t>{};
+#else
+        auto f_acc = [](auto a, auto b){ return 10*a + b;};
+#endif
+
+        return std::accumulate(dims.begin(),
+                               dims.end(),
+                               index_t(0),
+                               f_acc);
+    }
+};
+
 struct GeneratorTensor_Checkboard
 {
     template <class... Ts>
@@ -398,18 +421,7 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
 
 int main(int argc, char* argv[])
 {
-#if 0
-    constexpr index_t N  = 1;
-    constexpr index_t C  = 1;
-    constexpr index_t HI = 28;
-    constexpr index_t WI = 28;
-    constexpr index_t K  = 1;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
-
-    constexpr index_t HPad = 0;
-    constexpr index_t WPad = 0;
-#elif 1
+#if 1
     // 3x3, 34x34
     constexpr index_t N  = 64;
     constexpr index_t C  = 256;
@@ -656,13 +668,10 @@ int main(int argc, char* argv[])
 #if 0
         in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-#elif 0
-        in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 1
         in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-#elif 0
+#elif 1
         in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
 
         auto gen_wei = [](auto... is) {
@@ -681,7 +690,7 @@ int main(int argc, char* argv[])
     device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
 #elif 1
     device_implicit_gemm_convolution_1_chwn_cyxk_khwn
-#elif 1
+#elif 0
     device_implicit_gemm_convolution_2_chwn_cyxk_khwn
 #endif
     (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp
index f4d95dfccb..c8b621d384 100644
--- a/src/include/ConstantTensorDescriptor.hip.hpp
+++ b/src/include/ConstantTensorDescriptor.hip.hpp
@@ -8,6 +8,13 @@ __host__ __device__ constexpr auto calculate_default_strides(Sequence<L0, L1>)
     return Sequence<L1, 1>{};
 }
 
+// this is ugly, only for 3d
+template <index_t L0, index_t L1, index_t L2>
+__host__ __device__ constexpr auto calculate_default_strides(Sequence<L0, L1, L2>)
+{
+    return Sequence<L1 * L2, L2, 1>{};
+}
+
 // this is ugly, only for 4d
 template <index_t L0, index_t L1, index_t L2, index_t L3>
 __host__ __device__ constexpr auto calculate_default_strides(Sequence<L0, L1, L2, L3>)
@@ -79,6 +86,15 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0
     return Sequence<L1_align, 1>{};
 }
 
+// this is ugly, only for 3d
+template <index_t L0, index_t L1, index_t L2, index_t Align>
+__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0, L1, L2>,
+                                                                     Number<Align>)
+{
+    constexpr index_t L2_align = Align * ((L2 + Align - 1) / Align);
+    return Sequence<L1 * L2_align, L2_align, 1>{};
+}
+
 // this is ugly, only for 4d
 template <index_t L0, index_t L1, index_t L2, index_t L3, index_t Align>
 __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0, L1, L2, L3>,
@@ -244,6 +260,22 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
                desc.GetStride(I0),
                desc.GetStride(I1));
     }
+    else if(ndim == 3)
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+
+        printf("%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n",
+               s,
+               desc.GetDimension(),
+               desc.GetLength(I0),
+               desc.GetLength(I1),
+               desc.GetLength(I2),
+               desc.GetStride(I0),
+               desc.GetStride(I1),
+               desc.GetStride(I2));
+    }
     else if(ndim == 4)
     {
         constexpr auto I0 = Number<0>{};
diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp
index 141cb2602f..6521ddc0a5 100644
--- a/src/include/blockwise_2d_tensor_op.hip.hpp
+++ b/src/include/blockwise_2d_tensor_op.hip.hpp
@@ -14,7 +14,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
     constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
 
 #if 0
-    if(threadIdx.x == 0)
+    if(get_thread_local_1d_id() == 0)
     {
         print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
         print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
@@ -25,7 +25,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
 
     for(index_t iloop = 0; iloop < NLoop; ++iloop)
     {
-        index_t is = threadIdx.x + iloop * BlockSize;
+        index_t is = get_thread_local_1d_id() + iloop * BlockSize;
 
         const index_t did0 = is / desc.GetStride(I0);
 
@@ -42,7 +42,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
 
     if(has_tail)
     {
-        index_t is = threadIdx.x + NLoop * BlockSize;
+        index_t is = get_thread_local_1d_id() + NLoop * BlockSize;
 
         if(is < desc.GetElementSize())
         {
@@ -59,7 +59,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
     }
 }
 
-// Function: p_dst[reorder[i0], reorder[i1], reorder[i2], reorder[i3]] = p_src[i0,i1,i2,i3]
+// Function: p_dst[reorder[i0], reorder[i1] = p_src[i0,i1]
 // TODO: in order to optimize mem access for different mem type,
 // need to write specialized version
 template <index_t BlockSize,
@@ -92,7 +92,7 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
 
     for(index_t iloop = 0; iloop < NLoop; ++iloop)
     {
-        index_t is = threadIdx.x + iloop * BlockSize;
+        index_t is = get_thread_local_1d_id() + iloop * BlockSize;
 
         index_t did[2];
 
@@ -113,7 +113,7 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
 
     if(has_tail)
     {
-        index_t is = threadIdx.x + NLoop * BlockSize;
+        index_t is = get_thread_local_1d_id() + NLoop * BlockSize;
 
         if(is < ref_desc.GetElementSize())
         {
@@ -162,15 +162,96 @@ blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
         SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
 }
 
-template <index_t BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
+template <index_t BlockSize,
+          class Float,
+          class SrcDesc,
+          class DstDesc,
+          class CopyLengths,
+          index_t DataPerRead>
 struct Blockwise2dTensorCopy1
 {
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
+
+    __device__ constexpr Blockwise2dTensorCopy1()
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+
+        static_assert(DataPerRead == 1 ||
+                          (SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1),
+                      "wrong! only support stride1 == 1 if DataPerRead > 1!\n");
+
+        static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
+                      "wrong! only support DataPerRead == 1, 2 or 4!\n");
+
+        static_assert(SrcDesc{}.GetStride(I0) % DataPerRead == 0 &&
+                          DstDesc{}.GetStride(I0) % DataPerRead == 0,
+                      "src and dst stride2 should be multiple of DataPerRead to keep alignment");
+
+        // we allow out-of-bound read from src in D1 dimension,
+        //   but we need to make sure dst stride0 is big enough,
+        //   so that the out-of-bound write won't contaminate next line in dst
+        constexpr index_t L1          = CopyLengths{}.Get(I1);
+        constexpr index_t read_per_d1 = integer_divide_ceil(L1, DataPerRead);
+
+        static_assert(read_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0),
+                      "wrong! out-of-bound write will contaminate next line!\n");
+    }
+
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
-        constexpr auto dst_from_src_reorder = Sequence<0, 1>{};
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
 
-        blockwise_2d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
-            SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
+        constexpr auto src_desc = SrcDesc{};
+        constexpr auto dst_desc = DstDesc{};
+
+        constexpr index_t L0 = CopyLengths{}.Get(I0);
+        constexpr index_t L1 = CopyLengths{}.Get(I1);
+
+        constexpr index_t read_per_d1 = integer_divide_ceil(L1, DataPerRead);
+
+        constexpr auto ref_desc =
+            make_ConstantTensorDescriptor(Sequence<L0, read_per_d1>{});
+
+        constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
+
+        auto f_copy = [&](index_t is) {
+            index_t did[4];
+
+            did[0] = is / ref_desc.GetStride(I0);
+
+            is -= did[0] * ref_desc.GetStride(I0);
+
+            did[1] = is / ref_desc.GetStride(I1);
+
+            const index_t src_index =
+                src_desc.Get1dIndex(did[0], did[1] * DataPerRead);
+            const index_t dst_index =
+                dst_desc.Get1dIndex(did[0], did[1] * DataPerRead);
+
+            *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
+                *(reinterpret_cast<const vector_t*>(p_src + src_index));
+        };
+
+        for(index_t iloop = 0; iloop < NLoop; ++iloop)
+        {
+            index_t is = get_thread_local_1d_id() + iloop * BlockSize;
+
+            f_copy(is);
+        }
+
+        constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
+
+        if(has_tail)
+        {
+            index_t is = get_thread_local_1d_id() + NLoop * BlockSize;
+
+            if(is < ref_desc.GetElementSize())
+            {
+                f_copy(is);
+            }
+        }
     }
 };
 
diff --git a/src/include/blockwise_3d_tensor_op.hip.hpp b/src/include/blockwise_3d_tensor_op.hip.hpp
new file mode 100644
index 0000000000..c7a85431b2
--- /dev/null
+++ b/src/include/blockwise_3d_tensor_op.hip.hpp
@@ -0,0 +1,103 @@
+#pragma once
+#include "common.hip.hpp"
+#include "ConstantTensorDescriptor.hip.hpp"
+
+template <index_t BlockSize,
+          class Float,
+          class SrcDesc,
+          class DstDesc,
+          class CopyLengths,
+          index_t DataPerRead>
+struct Blockwise3dTensorCopy1
+{
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
+
+    __device__ constexpr Blockwise3dTensorCopy1()
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+
+        static_assert(DataPerRead == 1 ||
+                          (SrcDesc{}.GetStride(I2) == 1 && DstDesc{}.GetStride(I2) == 1),
+                      "wrong! only support stride2 == 1 if DataPerRead > 1!\n");
+
+        static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
+                      "wrong! only support DataPerRead == 1, 2 or 4!\n");
+
+        static_assert(SrcDesc{}.GetStride(I1) % DataPerRead == 0 &&
+                          DstDesc{}.GetStride(I1) % DataPerRead == 0,
+                      "src and dst stride1 should be multiple of DataPerRead to keep alignment");
+
+        // we allow out-of-bound read from src in D3 dimension,
+        //   but we need to make sure dst stride2 is big enough,
+        //   so that the out-of-bound write won't contaminate next line in dst
+        constexpr index_t L2          = CopyLengths{}.Get(I2);
+        constexpr index_t read_per_d2 = integer_divide_ceil(L2, DataPerRead);
+
+        static_assert(read_per_d2 * DataPerRead <= DstDesc{}.GetStride(I1),
+                      "wrong! out-of-bound write will contaminate next line!\n");
+    }
+
+    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+
+        constexpr auto src_desc = SrcDesc{};
+        constexpr auto dst_desc = DstDesc{};
+
+        constexpr index_t L0 = CopyLengths{}.Get(I0);
+        constexpr index_t L1 = CopyLengths{}.Get(I1);
+        constexpr index_t L2 = CopyLengths{}.Get(I2);
+
+        constexpr index_t read_per_d2 = integer_divide_ceil(L2, DataPerRead);
+
+        constexpr auto ref_desc =
+            make_ConstantTensorDescriptor(Sequence<L0, L1, read_per_d2>{});
+
+        constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
+
+        auto f_copy = [&](index_t is) {
+            index_t did[3];
+
+            did[0] = is / ref_desc.GetStride(I0);
+
+            is -= did[0] * ref_desc.GetStride(I0);
+
+            did[1] = is / ref_desc.GetStride(I1);
+
+            is -= did[1] * ref_desc.GetStride(I1);
+
+            did[2] = is / ref_desc.GetStride(I2);
+
+            const index_t src_index =
+                src_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead);
+            const index_t dst_index =
+                dst_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead);
+
+            *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
+                *(reinterpret_cast<const vector_t*>(p_src + src_index));
+        };
+
+        for(index_t iloop = 0; iloop < NLoop; ++iloop)
+        {
+            index_t is = get_thread_local_1d_id() + iloop * BlockSize;
+
+            f_copy(is);
+        }
+
+        constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
+
+        if(has_tail)
+        {
+            index_t is = get_thread_local_1d_id() + NLoop * BlockSize;
+
+            if(is < ref_desc.GetElementSize())
+            {
+                f_copy(is);
+            }
+        }
+    }
+};
diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp
index 444190e2a7..45de7823b5 100644
--- a/src/include/blockwise_4d_tensor_op.hip.hpp
+++ b/src/include/blockwise_4d_tensor_op.hip.hpp
@@ -15,7 +15,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
     constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
 
 #if 0
-    if(threadIdx.x == 0)
+    if(get_thread_local_1d_id() == 0)
     {
         print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
         print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
@@ -26,7 +26,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
 
     for(index_t iloop = 0; iloop < NLoop; ++iloop)
     {
-        index_t is = threadIdx.x + iloop * BlockSize;
+        index_t is = get_thread_local_1d_id() + iloop * BlockSize;
 
         const index_t did0 = is / desc.GetStride(I0);
 
@@ -51,7 +51,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
 
     if(has_tail)
     {
-        index_t is = threadIdx.x + NLoop * BlockSize;
+        index_t is = get_thread_local_1d_id() + NLoop * BlockSize;
 
         if(is < desc.GetElementSize())
         {
@@ -113,7 +113,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
 
     for(index_t iloop = 0; iloop < NLoop; ++iloop)
     {
-        index_t is = threadIdx.x + iloop * BlockSize;
+        index_t is = get_thread_local_1d_id() + iloop * BlockSize;
 
         index_t did[4];
 
@@ -142,7 +142,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
 
     if(has_tail)
     {
-        index_t is = threadIdx.x + NLoop * BlockSize;
+        index_t is = get_thread_local_1d_id() + NLoop * BlockSize;
 
         if(is < ref_desc.GetElementSize())
         {
@@ -287,7 +287,7 @@ struct Blockwise4dTensorCopy1
 
         for(index_t iloop = 0; iloop < NLoop; ++iloop)
         {
-            index_t is = threadIdx.x + iloop * BlockSize;
+            index_t is = get_thread_local_1d_id() + iloop * BlockSize;
 
             f_copy(is);
         }
@@ -296,7 +296,7 @@ struct Blockwise4dTensorCopy1
 
         if(has_tail)
         {
-            index_t is = threadIdx.x + NLoop * BlockSize;
+            index_t is = get_thread_local_1d_id() + NLoop * BlockSize;
 
             if(is < ref_desc.GetElementSize())
             {
@@ -370,7 +370,7 @@ struct BlockwiseChwnTensorCopyPadded
 
         for(index_t iloop = 0; iloop < NLoop; ++iloop)
         {
-            index_t is = threadIdx.x + iloop * BlockSize;
+            index_t is = get_thread_local_1d_id() + iloop * BlockSize;
 
             index_t did[4];
 
@@ -401,7 +401,7 @@ struct BlockwiseChwnTensorCopyPadded
 
         if(has_tail)
         {
-            index_t is = threadIdx.x + NLoop * BlockSize;
+            index_t is = get_thread_local_1d_id() + NLoop * BlockSize;
 
             if(is < ref_desc.GetElementSize())
             {
diff --git a/src/include/blockwise_batched_gemm.hip.hpp b/src/include/blockwise_batched_gemm.hip.hpp
index 6919325f98..bdaab2e90a 100644
--- a/src/include/blockwise_batched_gemm.hip.hpp
+++ b/src/include/blockwise_batched_gemm.hip.hpp
@@ -250,6 +250,15 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                     }
                 }
 
+#if 0
+                if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
+                {
+                        printf("a: %f %f %f %f %f %f %f %f, b: %f %f %f %f %f %f %f %f\n",
+                               p_a_thread[0], p_a_thread[1], p_a_thread[2], p_a_thread[3], p_a_thread[4], p_a_thread[5], p_a_thread[6], p_a_thread[7],
+                               p_b_thread[0], p_b_thread[1], p_b_thread[2], p_b_thread[3], p_b_thread[4], p_b_thread[5], p_b_thread[6], p_b_thread[7]);
+                }
+#endif
+
                 threadwise_gemm(a_thread_mtx,
                                 True,
                                 p_a_thread,
@@ -313,6 +322,8 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                           MPerThread == 8 && NPerThread == 8,
                       "Run_asm cannot deal with this GEMM shape yet\n");
 
+        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
+
         static_assert(
             BlockMatrixStrideA == 0 && BatchPerThread == 1,
             "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n");
diff --git a/src/include/blockwise_direct_convolution.hip.hpp b/src/include/blockwise_direct_convolution.hip.hpp
index 3aff3b7936..c79833f17d 100644
--- a/src/include/blockwise_direct_convolution.hip.hpp
+++ b/src/include/blockwise_direct_convolution.hip.hpp
@@ -42,7 +42,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
     constexpr index_t XThreadWork = (out_block_desc.GetLength(I3) + WoPerThread - 1) / WoPerThread;
 
 #if 0
-    if(threadIdx.x == 0)
+    if(get_thread_local_1d_id() == 0)
     {
         print_ConstantTensorDescriptor(in_block_desc);
         print_ConstantTensorDescriptor(wei_block_desc);
@@ -68,7 +68,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
     constexpr auto out_thread_block_desc =
         make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_block_desc.GetStrides());
 
-    const index_t thread_id = threadIdx.x;
+    const index_t thread_id = get_thread_local_1d_id();
 
     for(index_t thread_work_id = thread_id;
         thread_work_id < NThreadWork * KThreadWork * YThreadWork * XThreadWork;
diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp
index ad4e9d2cdf..4870227f7e 100644
--- a/src/include/blockwise_gemm.hip.hpp
+++ b/src/include/blockwise_gemm.hip.hpp
@@ -176,6 +176,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                           MPerThread == 8 && NPerThread == 8,
                       "Run_asm cannot deal with this GEMM shape yet\n");
 
+        static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
+
         using Float4 = vector_type<float, 4>::MemoryType;
 
         Float4* reg_a = (Float4*)(p_a_thread);
diff --git a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp
similarity index 96%
rename from src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp
rename to src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp
index eab42ed07b..67f1bca25d 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp
@@ -36,7 +36,7 @@ template <index_t GridSize,
           index_t InBlockCopyDataPerRead,
           index_t WeiBlockCopyDataPerRead,
           index_t OutThreadCopyDataPerWrite>
-struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn
+struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
 {
     __device__ void Run(const Float* const __restrict__ p_in_global,
                         const Float* const __restrict__ p_wei_global,
@@ -100,7 +100,7 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn
         // tensor view of blockwise input and weight in LDS
         //   be careful of alignment
         constexpr index_t max_align =
-            mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
+            mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead, GemmDataPerReadA, GemmDataPerReadB);
 
         constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{}, Number<max_align>{});
@@ -118,6 +118,14 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn
         // blockwise copy
         // input: format is [C, Hi, Wi, N]
         const auto blockwise_in_copy =
+#if 1
+            Blockwise4dTensorCopy1<BlockSize,
+                                   Float,
+                                   decltype(in_chwn_global_desc),
+                                   decltype(in_chwn_block_desc),
+                                   decltype(in_chwn_block_desc.GetLengths()),
+                                   InBlockCopyDataPerRead>{};
+#else
             Blockwise4dTensorCopy3<BlockSize,
                                    Float,
                                    decltype(in_chwn_global_desc),
@@ -125,6 +133,8 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn
                                    decltype(in_chwn_block_desc.GetLengths()),
                                    InBlockCopyThreadPerDims,
                                    InBlockCopyDataPerRead>{};
+#endif
+
 
         // blockwise wei copy
         //   format is [CPerBlock*Y*X,KPerBlock]
diff --git a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
similarity index 99%
rename from src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
rename to src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
index 5b42c638e1..365697ecfd 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
@@ -36,7 +36,7 @@ template <index_t GridSize,
           index_t InBlockCopyDataPerRead,
           index_t WeiBlockCopyDataPerRead,
           index_t OutThreadCopyDataPerWrite>
-struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn_lds_double_buffer
+struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn_lds_double_buffer
 {
     __device__ void Run(const Float* const __restrict__ p_in_global,
                         const Float* const __restrict__ p_wei_global,
diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp
index 0db7368071..fa710b3845 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp
@@ -2,8 +2,9 @@
 #include "common.hip.hpp"
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "ConstantMatrixDescriptor.hip.hpp"
-#include "blockwise_4d_tensor_op.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
+#include "blockwise_3d_tensor_op.hip.hpp"
+#include "blockwise_4d_tensor_op.hip.hpp"
 #include "threadwise_nd_tensor_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"
@@ -52,19 +53,19 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
 
-        constexpr auto in_chwn_global_desc  = InGlobalDesc{};
-        constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
-        constexpr auto out_khwn_global_desc = OutGlobalDesc{};
+        constexpr auto in_c_h_w_n_global_desc  = InGlobalDesc{};
+        constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
+        constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{};
 
-        constexpr index_t C = in_chwn_global_desc.GetLength(I0);
+        constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0);
 
-        constexpr index_t K  = out_khwn_global_desc.GetLength(I0);
-        constexpr index_t Ho = out_khwn_global_desc.GetLength(I1);
-        constexpr index_t Wo = out_khwn_global_desc.GetLength(I2);
-        constexpr index_t N  = out_khwn_global_desc.GetLength(I3);
+        constexpr index_t K  = out_k_h_w_n_global_desc.GetLength(I0);
+        constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1);
+        constexpr index_t Wo = out_k_h_w_n_global_desc.GetLength(I2);
+        constexpr index_t N  = out_k_h_w_n_global_desc.GetLength(I3);
 
-        constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1);
-        constexpr index_t X = wei_cyxk_global_desc.GetLength(I2);
+        constexpr index_t Y = wei_c_y_x_k_global_desc.GetLength(I1);
+        constexpr index_t X = wei_c_y_x_k_global_desc.GetLength(I2);
 
         constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
         constexpr index_t WiPerBlock = WoPerBlock + X - 1;
@@ -94,84 +95,85 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
         const index_t hi_block_data_begin = ho_block_data_begin;
         const index_t wi_block_data_begin = wo_block_data_begin;
 
-        // 2d tensor view of gridwise weight
-        constexpr auto wei_ck_global_desc =
-            make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
+        // global tensor view
+        constexpr auto wei_c_x_k_global_desc =
+            make_ConstantTensorDescriptor(Sequence<C, X, K>{}, Sequence<Y * X * K, K, 1>{});
 
-        // tensor view of blockwise input and weight in LDS
+        // LDS tensor view
         //   be careful of alignment
         constexpr index_t max_align =
-            mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
+            mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead, GemmDataPerReadA, GemmDataPerReadB);
 
-        constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor_aligned(
-            Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{}, Number<max_align>{});
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
+            Sequence<CPerBlock, HoPerBlock, WiPerBlock, NPerBlock>{}, Number<max_align>{});
 
-        constexpr auto wei_ck_block_desc = make_ConstantTensorDescriptor_aligned(
-            Sequence<CPerBlock, KPerBlock>{}, Number<max_align>{});
+        constexpr auto wei_c_x_k_block_desc = make_ConstantTensorDescriptor_aligned(
+            Sequence<CPerBlock, X, KPerBlock>{}, Number<max_align>{});
 
         // tensor view of threadwise output in register
-        constexpr auto out_khwn_thread_desc = make_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
             Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
 
         // blockwise copy
         // input: format is [C, Hi, Wi, N]
         const auto blockwise_in_copy =
+#if 1
+            Blockwise4dTensorCopy1<BlockSize,
+                                   Float,
+                                   decltype(in_c_h_w_n_global_desc),
+                                   decltype(in_c_h_w_n_block_desc),
+                                   decltype(in_c_h_w_n_block_desc.GetLengths()),
+                                   InBlockCopyDataPerRead>{};
+#else
             Blockwise4dTensorCopy3<BlockSize,
                                    Float,
-                                   decltype(in_chwn_global_desc),
-                                   decltype(in_chwn_block_desc),
-                                   decltype(in_chwn_block_desc.GetLengths()),
+                                   decltype(in_c_h_w_n_global_desc),
+                                   decltype(in_c_h_w_n_block_desc),
+                                   decltype(in_c_h_w_n_block_desc.GetLengths()),
                                    InBlockCopyThreadPerDims,
                                    InBlockCopyDataPerRead>{};
-
-        // blockwise wei copy
-        //   format is [CPerBlock, KPerBlock]
-        const auto blockwise_wei_copy =
-#if 0 // debug
-            Blockwise2dTensorCopy1<BlockSize,
-                                   Float,
-                                   decltype(wei_ck_global_desc),
-                                   decltype(wei_ck_block_desc),
-                                   decltype(wei_ck_block_desc.GetLengths())>{};
-#else
-            Blockwise2dTensorCopy3<BlockSize,
-                                   Float,
-                                   decltype(wei_ck_global_desc),
-                                   decltype(wei_ck_block_desc),
-                                   decltype(wei_ck_block_desc.GetLengths()),
-                                   WeiBlockCopyDataPerRead>{};
 #endif
 
-            // a series of blockwise batched GEMM
-            // C_matrix += transpose(A_matrix) * B_matrix
-            //   A_matrix and B_matrix saved in LDS, C_matrix saved in register
-            //   A_matrix[C,K] is a sub-matrix of wei_block[C,K]
-            //   B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
-            //   C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
-            constexpr auto a_cxk_block_mtx_desc =
-                make_ConstantMatrixDescriptor(Number<CPerBlock>{},
-                                              Number<KPerBlock>{},
-                                              Number<wei_ck_block_desc.GetStride(I0)>{});
+        // blockwise wei copy
+        //   format is [CPerBlock, X * KPerBlock]
+        const auto blockwise_wei_copy =
+            Blockwise3dTensorCopy1<BlockSize,
+                                   Float,
+                                   decltype(wei_c_x_k_global_desc),
+                                   decltype(wei_c_x_k_block_desc),
+                                   decltype(wei_c_x_k_block_desc.GetLengths()),
+                                   WeiBlockCopyDataPerRead>{};
 
-        constexpr auto b_cxwn_block_mtx_desc =
+        // a series of blockwise batched GEMM
+        // C_matrix += transpose(A_matrix) * B_matrix
+        //   A_matrix and B_matrix saved in LDS, C_matrix saved in register
+        //   A_matrix[C,K] is a sub-matrix of wei_block[C,K]
+        //   B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
+        //   C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
+        constexpr auto a_c_k_block_mtx_desc =
+            make_ConstantMatrixDescriptor(Number<CPerBlock>{},
+                                          Number<KPerBlock>{},
+                                          Number<wei_c_x_k_block_desc.GetStride(I0)>{});
+
+        constexpr auto b_c_wn_block_mtx_desc =
             make_ConstantMatrixDescriptor(Number<CPerBlock>{},
                                           Number<WoPerBlock * NPerBlock>{},
-                                          Number<in_chwn_block_desc.GetStride(I0)>{});
+                                          Number<in_c_h_w_n_block_desc.GetStride(I0)>{});
 
-        constexpr auto c_kxwn_thread_mtx_desc =
+        constexpr auto c_k_wn_thread_mtx_desc =
             make_ConstantMatrixDescriptor(Number<KPerThread>{},
                                           Number<WoPerThread * NPerThread>{},
-                                          Number<out_khwn_thread_desc.GetStride(I0)>{});
+                                          Number<out_k_h_w_n_thread_desc.GetStride(I0)>{});
 
         const auto blockwise_batch_gemm =
             BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2<
                 BlockSize,
-                decltype(a_cxk_block_mtx_desc),
-                decltype(b_cxwn_block_mtx_desc),
-                decltype(c_kxwn_thread_mtx_desc),
+                decltype(a_c_k_block_mtx_desc),
+                decltype(b_c_wn_block_mtx_desc),
+                decltype(c_k_wn_thread_mtx_desc),
                 0,
-                in_chwn_block_desc.GetStride(I1),
-                out_khwn_thread_desc.GetStride(I1),
+                in_c_h_w_n_block_desc.GetStride(I1),
+                out_k_h_w_n_thread_desc.GetStride(I1),
                 HoPerBlock,
                 GemmMPerThreadSubC,
                 GemmNPerThreadSubC,
@@ -185,64 +187,64 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
                 GemmDataPerReadB>{};
 
         // LDS: be careful of alignment
-        constexpr index_t in_block_space  = in_chwn_block_desc.GetElementSpace(Number<max_align>{});
-        constexpr index_t wei_block_space = wei_ck_block_desc.GetElementSpace(Number<max_align>{});
+        constexpr index_t in_block_space  = in_c_h_w_n_block_desc.GetElementSpace(Number<max_align>{});
+        constexpr index_t wei_block_space = wei_c_x_k_block_desc.GetElementSpace(Number<max_align>{});
 
         __shared__ Float p_in_block[in_block_space];
         __shared__ Float p_wei_block[wei_block_space];
 
         // register
-        Float p_out_thread[out_khwn_thread_desc.GetElementSpace()];
+        Float p_out_thread[out_k_h_w_n_thread_desc.GetElementSpace()];
 
 #if 0
         if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
         {
-            print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
-            print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc");
-            print_ConstantTensorDescriptor(wei_ck_global_desc, "wei_ck_global_desc");
+            print_ConstantTensorDescriptor(in_c_h_w_n_global_desc, "in_c_h_w_n_global_desc");
+            print_ConstantTensorDescriptor(wei_c_y_x_k_global_desc, "wei_c_y_x_k_global_desc");
 
-            print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
-            print_ConstantTensorDescriptor(wei_ck_block_desc, "wei_ck_block_desc");
+            print_ConstantTensorDescriptor(in_c_h_w_n_block_desc, "in_c_h_w_n_block_desc");
+            print_ConstantTensorDescriptor(wei_c_x_k_block_desc, "wei_c_x_k_block_desc");
 
             printf("in_block_space %u, wei_block_space %u\n", in_block_space, wei_block_space);
         }
 #endif
 
         // set threadwise output tensor to 0
-        threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread);
+        threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);
 
         const Float* p_in_global_block_offset =
             p_in_global +
-            in_chwn_global_desc.Get1dIndex(
+            in_c_h_w_n_global_desc.Get1dIndex(
                 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
 
         const Float* p_wei_global_block_offset =
-            p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
+            p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
 
         for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
-                    p_in_global_block_offset += CPerBlock * in_chwn_global_desc.GetStride(I0),
-                    p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0))
+                    p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
+                    p_wei_global_block_offset += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
         {
-            // input: global mem to LDS
-            blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
-
             for(index_t y = 0; y < Y; ++y)
             {
+                blockwise_in_copy.Run(p_in_global_block_offset +
+                                           in_c_h_w_n_global_desc.Get1dIndex(0, y, 0, 0),
+                                      p_in_block);
+
+                blockwise_wei_copy.Run(p_wei_global_block_offset +
+                                           wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, 0),
+                                       p_wei_block);
+
+                __syncthreads();
+
                 for(index_t x = 0; x < X; ++x)
                 {
-                    // weight: global mem to LDS
-                    blockwise_wei_copy.Run(p_wei_global_block_offset +
-                                               wei_cyxk_global_desc.Get1dIndex(0, y, x, 0),
-                                           p_wei_block);
-
-                    __syncthreads();
-
-                    blockwise_batch_gemm.Run(p_wei_block,
-                                             p_in_block + in_chwn_block_desc.Get1dIndex(0, y, x, 0),
+                    blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0),
+                                             p_in_block + in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0),
                                              p_out_thread);
 
-                    __syncthreads();
                 }
+
+                __syncthreads();
             }
         }
 
@@ -324,7 +326,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
             p_out_thread,
             out_10d_global_desc,
             p_out_global +
-                out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
                                                 ho_block_data_begin + ho_thread_data_begin,
                                                 wo_block_data_begin + wo_thread_data_begin,
                                                 n_block_data_begin + n_thread_data_begin),
diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
index b301fc1e52..cbebe28f17 100644
--- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
@@ -125,7 +125,7 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
     constexpr index_t HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
     constexpr index_t WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
 
-    const index_t thread_id = threadIdx.x;
+    const index_t thread_id = get_thread_local_1d_id();
 
     itmp                           = thread_id;
     const index_t n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
index 250253f2ff..0b83eccc3c 100644
--- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
@@ -137,7 +137,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
     constexpr index_t HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
     constexpr index_t WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
 
-    const index_t thread_id = threadIdx.x;
+    const index_t thread_id = get_thread_local_1d_id();
 
     itmp                           = thread_id;
     const index_t n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
diff --git a/src/include/threadwise_2d_tensor_op.hip.hpp b/src/include/threadwise_2d_tensor_op.hip.hpp
index 6e25b61b73..f8b8f722e3 100644
--- a/src/include/threadwise_2d_tensor_op.hip.hpp
+++ b/src/include/threadwise_2d_tensor_op.hip.hpp
@@ -10,7 +10,7 @@ __device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __re
     constexpr auto desc = Desc{};
 
 #if 0
-    if(threadIdx.x == 0)
+    if(get_thread_local_1d_id() == 0)
     {
         print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_op_unary: ");
     }
@@ -112,7 +112,7 @@ __device__ void threadwise_2d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
     constexpr auto desc = Desc{};
 
 #if 0
-    if(threadIdx.x == 0)
+    if(get_thread_local_1d_id() == 0)
     {
         print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_shift_down: ");
     }
diff --git a/src/include/threadwise_4d_tensor_op.hip.hpp b/src/include/threadwise_4d_tensor_op.hip.hpp
index 19ab68d013..21fed4d286 100644
--- a/src/include/threadwise_4d_tensor_op.hip.hpp
+++ b/src/include/threadwise_4d_tensor_op.hip.hpp
@@ -12,7 +12,7 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re
     constexpr auto desc = Desc{};
 
 #if 0
-    if(threadIdx.x == 0)
+    if(get_thread_local_1d_id() == 0)
     {
         print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_op_unary: ");
     }
@@ -218,7 +218,7 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
     constexpr auto desc = Desc{};
 
 #if 0
-    if(threadIdx.x == 0)
+    if(get_thread_local_1d_id() == 0)
     {
         print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_shift_down: ");
     }
diff --git a/src/include/threadwise_direct_convolution.hip.hpp b/src/include/threadwise_direct_convolution.hip.hpp
index 9b90c402a1..1c033573b9 100644
--- a/src/include/threadwise_direct_convolution.hip.hpp
+++ b/src/include/threadwise_direct_convolution.hip.hpp
@@ -20,7 +20,7 @@ __device__ void threadwise_direct_convolution_1(InDesc,
     constexpr auto out_desc = OutDesc{};
 
 #if 0
-    if(blockIdx.x == 0 && threadIdx.x == 0)
+    if(blockIdx.x == 0 && get_thread_local_1d_id() == 0)
     {
         print_ConstantTensorDescriptor(in_desc, "threadwise_direct_convolution: in_desc: ");
         print_ConstantTensorDescriptor(wei_desc, "threadwise_direct_convolution: wei_desc: ");