diff --git a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
index c41be1e62a..6bda3cb511 100644
--- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp
@@ -39,6 +39,9 @@ using DeviceGemmV2Instance =
         2, 32, 32, 1,
         1, 1, S<1, 16, 1, 4>, 4,
         ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
+
+    static int NPerBlock = 16;
+    static int KPerBlock = 256;
 #else
         128,
         16, 32, 
@@ -51,8 +54,11 @@ using DeviceGemmV2Instance =
         2, 32, 32, 0,
         1, 1, S<1, 16, 1, 8>, 4,
         ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
+
+    static int NPerBlock = 32;
+    static int KPerBlock = 128;
 #endif
-      // clang-format on
+// clang-format on
 
     using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                             BDataType,
@@ -146,30 +152,37 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
     DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
 
-#if 1
-    int NPerBlock = 32;
-    int KPerBlock = 128;
-
+//weight permute
+#if 0
     int N1 = NPerBlock;
     int K1 = KPerBlock;
 
     int N0 = N / N1;
     int K0 = K / K1;
+    int K01 = K0 / KBatch;
+    int K00 = KBatch;
 
-    for(int i = 0; i < N0; i++)
+    std::cout << "K00 = " << K00 << " K01 = " << K01 << std::endl;
+
+    for(int k = 0; k < K00; k++)
     {
-        for(int j = 0; j < K0; j++)
+        for(int i = 0; i < N0; i++)
         {
-            for(int ii = 0; ii < N1; ii++)
+            for(int j = 0; j < K01; j++)
             {
-                for(int jj = 0; jj < K1; jj++)
+                for(int ii = 0; ii < N1; ii++)
                 {
-                    b_k_n_permute(i * K0 * N1 * K1 + j * N1 * K1 + ii * K1 + jj) =
-                        b_k_n((i * N1 + ii) * K + (j * K1 + jj));
+                    for(int jj = 0; jj < K1; jj++)
+                    {
+                        b_k_n_permute(k * N0 * K01 * N1 * K1 + i * K01 * N1 * K1 + j * N1 * K1 + ii * K1 + jj) =
+                            b_k_n((i * N1 + ii) * K + (k * K01 * K1 + j * K1 + jj));
+                    }
                 }
             }
         }
     }
+
+    
 #else
     for(int i = 0; i < N; i++)
     {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
index 612a9a3b04..3b34a591a3 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
@@ -14,6 +14,8 @@
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
+//#define WEIGHT_PERMUTE
+
 namespace ck {
 
 // Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same
@@ -387,8 +389,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
         }
         else
         {
-            // B Tile Permute
-#if 0
+            // Weight Tile Permute
+#ifndef WEIGHT_PERMUTE
             // not pad N or K
             const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
                 b_grid_desc_nraw_kraw,
@@ -619,10 +621,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
             }
             else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
             {
-#if 0
+#ifndef WEIGHT_PERMUTE
                 b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize;
 #else
-                const int k0_offset = karg.KRead * NPerBlock;
+                const int k0_offset = karg.KRead * karg.N;
                 b_k_split_offset    = blockIdx.z * k0_offset / BPackedSize;
 #endif
             }