diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
index 83e8c7e78f..7a04c93ba1 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
@@ -1181,6 +1181,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                        block_idx, iter_start, iter_end, is_sk_block, is_dp_block);
             }
 
+            __syncthreads();
+
             while(true)
             {
                 uint32_t current_iter_length = __builtin_amdgcn_readfirstlane(
@@ -1211,6 +1213,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                     MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                         c_grid_desc_m_n, problem.MBlock, problem.NBlock);
 
+                // Emin @added
+                __syncthreads();
+
                 // Emin @debug
                 // Debug: Print grid descriptor sizes
                 if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -1220,6 +1225,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                            c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
                 }
 
+                // Emin @added
+                __syncthreads();
+
                 // Create dynamic buffers for A, B, C matrices in global memory       
                 auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                     p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
@@ -1248,6 +1256,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                 const index_t k0_block_data_idx_on_grid =
                     __builtin_amdgcn_readfirstlane(iter_offset * AK0Number);
 
+                // Emin @added
+                __syncthreads();
+
                 // Emin @debug
                 // Debug: Print block data indices on grid
                 if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -1255,6 +1266,10 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                            m_block_data_idx_on_grid, n_block_data_idx_on_grid, k0_block_data_idx_on_grid);
                 }
 
+
+                // Emin @added
+                __syncthreads();
+
                 // lds max alignment
                 constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
 
@@ -1367,12 +1382,18 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                     (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
                     KPerBlock);
 
+                // Emin @added
+                __syncthreads();
+
                 // Emin @debug
                 // Debug: Print number of K blocks in main loop
                 if (threadIdx.x == 0 && threadIdx.y == 0) {
                     printf("Number of K Blocks in Main Loop: %d\n", num_k_block_main_loop);
                 }
 
+                // Emin @added
+                __syncthreads();
+
                 blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
                     a_grid_desc_ak0_m_ak1,
                     a_block_desc_ak0_m_ak1,
@@ -1564,13 +1585,19 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                         // make sure it's safe to write to LDS
                         block_sync_lds();
 
+                          // Emin @added
+                        __syncthreads();
+
                         // Emin @debug
                         // Debug: Print before writing C to LDS
                         if (threadIdx.x == 0 && threadIdx.y == 0) {
-                            printf("Gridwise_gemm_sk line 1570 --Block %d, Access %d: Writing C from VGPR to LDS.\n", block_idx, static_cast<int>(access_id));
+                            printf("Gridwise_gemm_sk line 1594 --Block %d, Access %d: Writing C from VGPR to LDS.\n", block_idx, static_cast<int>(access_id));
                         }
 
 
+                         // Emin @added
+                        __syncthreads();
+
                         // each thread write its data from VGPR to LDS
                         c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
                                                       sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
@@ -1598,10 +1625,17 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                         }
                         else if(is_sk_block)
                         {
+                            // Emin @added
+                            __syncthreads();
+
                             if (threadIdx.x == 0 && threadIdx.y == 0) {
                                 printf("Gridwise_gemm_sk line 1602 --is_sk_block !! each block copy data from LDS to global.\n");
                             }
 
+                            // Emin @added
+                            __syncthreads();
+
+
                             // each block copy its data from LDS to global
                             c_shuffle_block_copy_lds_to_global
                                 .template Run<decltype(c_shuffle_block_buf),
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp
index fb6dae2527..3bdf953468 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp
@@ -106,8 +106,8 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
              // Debug: Print source vector data if valid
             if (threadIdx.x == 0 && threadIdx.y == 0 && is_src_valid) {
                 // printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f\n", static_cast<int>(idx_1d.value), static_cast<float>());
-                // printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n", static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
-                printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n", static_cast<int>(idx_1d.value), src_vector_container.template AsType<SrcData>().At(Number<0>{}));
+                printf("BlockId %d -  Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
+                // printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n", static_cast<int>(idx_1d.value), src_vector_container.template AsType<SrcData>().At(Number<0>{}));
             }
 
             // apply pointwise operation
@@ -123,25 +123,43 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
                 //     printf("Threadwise_tensor slice v6r1r2 line 121 : Element-wise Operation Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(v));
                 // }
 
+                // Emin @added
+                __syncthreads();
+
+
                  // Debug: Print SrcData before and after applying element-wise operation
                 if (threadIdx.x == 0 && threadIdx.y == 0) {
                     // printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<i>{})));
-                    printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d , i %d: %hu \n",static_cast<int>(idx_1d.value),  static_cast<int>(i.value), src_vector_container.template AsType<SrcData>().At(Number<i>{}));
+                    // printf("BlockId %d -  Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d , i %d: %hu \n", static_cast<int>(blockIdx.x) ,  static_cast<int>(idx_1d.value),  static_cast<int>(i.value), src_vector_container.template AsType<SrcData>().At(Number<i>{}));
+                    // // printf("SrcData after element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(v));
+                    // printf("BlockId %d -  Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %hu \n" , static_cast<int>(blockIdx.x) ,  static_cast<int>(idx_1d.value) , static_cast<int>(i.value), v);
+                
+                    printf("BlockId %d -  Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d , i %d: %f \n", static_cast<int>(blockIdx.x) ,  static_cast<int>(idx_1d.value),  static_cast<int>(i.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<i>{})));
                     // printf("SrcData after element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(v));
-                    printf("Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %hu \n", static_cast<int>(idx_1d.value) , static_cast<int>(i.value), v);
+                    printf("BlockId %d -  Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %f \n" , static_cast<int>(blockIdx.x) ,  static_cast<int>(idx_1d.value) , static_cast<int>(i.value), static_cast<float>(v));
                 }
 
+                // Emin @added
+                __syncthreads();
+
                 // apply type convert
                 dst_vector_container.template AsType<DstData>()(i) = type_convert<DstData>(v);
 
+                // Emin @added
+                __syncthreads();
+
                 // Emin @debug
                 // Debug: Print type conversion result
                 if (threadIdx.x == 0 && threadIdx.y == 0) {
                     // printf("Threadwise_tensor slice v6r1r2 line 121 : Type Conversion Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(dst_vector_container.template AsType<DstData>()[i]));
                     //   printf("DstData after type conversion at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(dst_vector_container.template AsType<DstData>().At(Number<i>{})));
-                    printf("Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i  %d: %hu \n", static_cast<int>(idx_1d.value) , static_cast<int>(i.value), dst_vector_container.template AsType<DstData>().At(Number<i>{}));
+                    // printf("BlockId %d -  Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i  %d: %hu \n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value) , static_cast<int>(i.value), dst_vector_container.template AsType<DstData>().At(Number<i>{}));
                 
+                    printf("BlockId %d -  Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i  %d: %f \n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value) , static_cast<int>(i.value), static_cast<float>(dst_vector_container.template AsType<DstData>().At(Number<i>{})));
                 }
+
+                // Emin @added
+                __syncthreads();
             });
 
             const bool is_dst_valid =
@@ -153,9 +171,12 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
                 is_dst_valid,
                 dst_vector_container.template AsType<dst_vector_t>()[I0]);
 
-            //  // Debug: Print data before copying from dst_vector into dst_buf
+            // // //  // Debug: Print data before copying from dst_vector into dst_buf
             // if (threadIdx.x == 0 && threadIdx.y == 0 && is_dst_valid) {
-            //     printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast<int>(idx_1d.value), dst_buf.template AsType<dst_vector_t>().At(I0));
+            //     // printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast<int>(idx_1d.value), dst_buf.template AsType<DstData>().At(I0));
+            //     // printf("BlockId %d -  Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
+
+            //     printf("BlockId %d -  Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
             // }
 
             // move coordinate
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index 5e48805b22..0d879d440d 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -15,17 +15,24 @@ else
 fi
 
 cmake                                                                                             \
--D CMAKE_PREFIX_PATH=/opt/rocm-6.2.1/                                                                 \
--D CMAKE_CXX_COMPILER=/opt/rocm-6.2.1/bin/hipcc                                                        \
+-D CMAKE_PREFIX_PATH=/opt/rocm                                                                 \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                        \
 -D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0  -fPIE  -Wno-gnu-line-marker"     \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
 -D GPU_TARGETS=$GPU_TARGETS                                                                       \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
--D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
+-D USE_BITINT_EXTENSION_INT4=OFF             \
+-D CK_LOGGING=ON                                                     \
 $REST_ARGS                                                                                        \
 ${MY_PROJECT_SOURCE}
 
 
 # -D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
-# -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc     
\ No newline at end of file
+# -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc     
+
+
+# -D CMAKE_PREFIX_PATH=/opt/rocm-6.2.1/                                                                 \
+# -D CMAKE_CXX_COMPILER=/opt/rocm-6.2.1/bin/hipcc   
+
+# -D CK_LOGGING=ON
\ No newline at end of file