diff --git a/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp b/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp
index 3e09a04fac..eaa8df4ca4 100644
--- a/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp
@@ -27,7 +27,7 @@ using DeviceGemmV2Instance =
         ALayout,   BLayout,  CLayout,   
         ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, 
         AElementOp, BElementOp, CElementOp, GemmDefault, 
-#if 0
+#if 1
         64,
         16, 16, 
         256, 8, 16,
diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc
index c3f4f58439..1b042fe0ee 100644
--- a/example/01_gemm/run_gemm_example_v2.inc
+++ b/example/01_gemm/run_gemm_example_v2.inc
@@ -256,7 +256,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
       //                             get_rtol<CDataType>(),
       //                             get_atol<CDataType>());
 
-      LogRangeAsType<float>(std::cout << "c_m_n_device_buf : ", c_m_n_device_result.mData, ",") << std::endl;
+      //for(int i = 0; i < M; i++)
+      //{
+      //    for(int j = 0; j < N; j++)
+      //    {
+      //        std::cout << ck::type_convert<float>(c_m_n_device_result(i, j)) << ",";
+      //    }
+      //    std::cout << std::endl;
+      //}
 #endif
     }
 
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 618813b781..83d2193eef 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -25,7 +25,7 @@ struct PassThroughPack2
 
     __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
 	{
-#if 1
+#if 0
 		uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
 		uint8_t x_l  = (x_u8 & 0x0f) >> 0;
 		uint8_t x_h  = (x_u8 & 0xf0) >> 4;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
index e89736d7c5..1037ec410f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
@@ -151,6 +151,20 @@ struct GridwiseGemm_xdl_cshuffle_v3
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
     {
         return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch);
@@ -625,9 +639,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
         // in some cases.
         else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
         {
-            constexpr auto MLdsLayer        = 32 * 4 / KPerBlock / sizeof(ADataType) < 1
-                                                  ? 1
-                                                  : 32 * 4 / KPerBlock / sizeof(ADataType);
+            constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(ADataType);
+            constexpr auto MLdsLayer =  LdsSize < 1 ? 1 : LdsSize;
             constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
                 make_tuple(
                     AK0Number * Number<MLdsLayer>{}, Number<MPerBlock / MLdsLayer>{}, AK1Number),
@@ -761,10 +774,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
         else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
         {
             // NLdsLayer * K0 as logical Bank
-            constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1
-                                           ? 1
-                                           : 32 * 4 / KPerBlock / sizeof(BDataType);
-            ;
+            constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(BDataType);
+            constexpr auto NLdsLayer =   LdsSize < 1 ? 1 : LdsSize;
             constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
                 make_tuple(
                     BK0Number * Number<NLdsLayer>{}, Number<NPerBlock / NLdsLayer>{}, BK1Number),
@@ -924,20 +935,6 @@ struct GridwiseGemm_xdl_cshuffle_v3
                                 NXdlPerWave,
                                 KPack>())>;
 
-    static constexpr index_t APackedSize = []() {
-        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
-            return 2;
-        else
-            return 1;
-    }();
-
-    static constexpr index_t BPackedSize = []() {
-        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
-            return 2;
-        else
-            return 1;
-    }();
-
     __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
         // LDS allocation for A and B: be careful of alignment
@@ -1326,8 +1323,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
             static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);
 
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            bit_cast<BDataType*>(bit_cast<unsigned char *>(p_shared) +
-                a_block_space_size_aligned * sizeof(ADataType)),
+            reinterpret_cast<BDataType*>(static_cast<ADataType*>(p_shared) +
+                a_block_space_size_aligned),
             b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);
 
         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
index c4b96def41..a0a0e7f230 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -1211,10 +1211,6 @@ struct ThreadwiseTensorSliceTransfer_v4
                         dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
 
                     dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
-
-
-                    if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
-                        printf("v4: %f %d\n", type_convert<float>(dst_buf[Number<dst_offset>{}]), threadIdx.x);
                 });
             }
         });
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
index 6beaf9155a..6b4b2a8a04 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -554,9 +554,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                 dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);
 
                 dst_vector_container.template AsType<DstData>()(i) = dst_v;
-
-                //if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
-                    //printf("v3r1: %f %d\n", type_convert<float>(dst_v), threadIdx.x);
             });
 
             // copy data from dst_vector_container to dst_buf
diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index f2db1c95a9..d8ccb2ea76 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -157,16 +157,6 @@ struct intrin_mfma_f32_16x16x16f16<16, 16>
     template <class FloatC>
     __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
     {
-        auto tmp_a = vector_type<half_t, 4>{reg_a};
-        auto tmp_b = vector_type<half_t, 4>{reg_b};
-        printf("{%f %f}, {%f %f}, {%f %f}, {%f %f} %d %d\n", 
-        static_cast<float>(tmp_a.template AsType<half_t>()(Number<0>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<0>{})),
-        static_cast<float>(tmp_a.template AsType<half_t>()(Number<1>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<1>{})),
-        static_cast<float>(tmp_a.template AsType<half_t>()(Number<2>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<2>{})),
-        static_cast<float>(tmp_a.template AsType<half_t>()(Number<3>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<3>{})),
-        threadIdx.x, blockIdx.x
-        );
-
         reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
             reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
     }