diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp index 83e8c7e78f..7a04c93ba1 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp @@ -1181,6 +1181,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 block_idx, iter_start, iter_end, is_sk_block, is_dp_block); } + __syncthreads(); + while(true) { uint32_t current_iter_length = __builtin_amdgcn_readfirstlane( @@ -1211,6 +1213,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( c_grid_desc_m_n, problem.MBlock, problem.NBlock); + // Emin @added + __syncthreads(); + // Emin @debug // Debug: Print grid descriptor sizes if (threadIdx.x == 0 && threadIdx.y == 0) { @@ -1220,6 +1225,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); } + // Emin @added + __syncthreads(); + // Create dynamic buffers for A, B, C matrices in global memory auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); @@ -1248,6 +1256,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 const index_t k0_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(iter_offset * AK0Number); + // Emin @added + __syncthreads(); + // Emin @debug // Debug: Print block data indices on grid if (threadIdx.x == 0 && threadIdx.y == 0) { @@ -1255,6 +1266,10 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 m_block_data_idx_on_grid, n_block_data_idx_on_grid, k0_block_data_idx_on_grid); } + + // Emin @added + __syncthreads(); + // lds max alignment constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number); @@ -1367,12 +1382,18 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / KPerBlock); + // Emin @added + __syncthreads(); + // Emin @debug // Debug: Print number of K blocks in main loop if (threadIdx.x == 0 && threadIdx.y == 0) { printf("Number of K Blocks in Main Loop: %d\n", num_k_block_main_loop); } + // Emin @added + __syncthreads(); + blockwise_gemm_pipeline.template Run( a_grid_desc_ak0_m_ak1, a_block_desc_ak0_m_ak1, @@ -1564,13 +1585,19 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 // make sure it's safe to write to LDS block_sync_lds(); + // Emin @added + __syncthreads(); + // Emin @debug // Debug: Print before writing C to LDS if (threadIdx.x == 0 && threadIdx.y == 0) { - printf("Gridwise_gemm_sk line 1570 --Block %d, Access %d: Writing C from VGPR to LDS.\n", block_idx, static_cast(access_id)); + printf("Gridwise_gemm_sk line 1594 --Block %d, Access %d: Writing C from VGPR to LDS.\n", block_idx, static_cast(access_id)); } + // Emin @added + __syncthreads(); + // each thread write its data from VGPR to LDS c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, sfc_c_vgpr.GetIndexTupleOfNumber(access_id), @@ -1598,10 +1625,17 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 } else if(is_sk_block) { + // Emin @added + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { printf("Gridwise_gemm_sk line 1602 --is_sk_block !! each block copy data from LDS to global.\n"); } + // Emin @added + __syncthreads(); + + // each block copy its data from LDS to global c_shuffle_block_copy_lds_to_global .template Run(idx_1d.value), static_cast()); - // printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n", static_cast(idx_1d.value), static_cast(src_vector_container.template AsType().At(Number<0>{}))); - printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n", static_cast(idx_1d.value), src_vector_container.template AsType().At(Number<0>{})); + printf("BlockId %d - Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n", static_cast(blockIdx.x) , static_cast(idx_1d.value), static_cast(src_vector_container.template AsType().At(Number<0>{}))); + // printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n", static_cast(idx_1d.value), src_vector_container.template AsType().At(Number<0>{})); } // apply pointwise operation @@ -123,25 +123,43 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2 // printf("Threadwise_tensor slice v6r1r2 line 121 : Element-wise Operation Result at idx %d: %f\n", static_cast(i.value), static_cast(v)); // } + // Emin @added + __syncthreads(); + + // Debug: Print SrcData before and after applying element-wise operation if (threadIdx.x == 0 && threadIdx.y == 0) { // printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d: %f \n", static_cast(i.value), static_cast(src_vector_container.template AsType().At(Number{}))); - printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d , i %d: %hu \n",static_cast(idx_1d.value), static_cast(i.value), src_vector_container.template AsType().At(Number{})); + // printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d , i %d: %hu \n", static_cast(blockIdx.x) , static_cast(idx_1d.value), static_cast(i.value), src_vector_container.template AsType().At(Number{})); + // // printf("SrcData after element-wise op at idx %d: %f \n", static_cast(i.value), static_cast(v)); + // printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %hu \n" , static_cast(blockIdx.x) , static_cast(idx_1d.value) , static_cast(i.value), v); + + printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d , i %d: %f \n", static_cast(blockIdx.x) , static_cast(idx_1d.value), static_cast(i.value), static_cast(src_vector_container.template AsType().At(Number{}))); // printf("SrcData after element-wise op at idx %d: %f \n", static_cast(i.value), static_cast(v)); - printf("Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %hu \n", static_cast(idx_1d.value) , static_cast(i.value), v); + printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %f \n" , static_cast(blockIdx.x) , static_cast(idx_1d.value) , static_cast(i.value), static_cast(v)); } + // Emin @added + __syncthreads(); + // apply type convert dst_vector_container.template AsType()(i) = type_convert(v); + // Emin @added + __syncthreads(); + // Emin @debug // Debug: Print type conversion result if (threadIdx.x == 0 && threadIdx.y == 0) { // printf("Threadwise_tensor slice v6r1r2 line 121 : Type Conversion Result at idx %d: %f\n", static_cast(i.value), static_cast(dst_vector_container.template AsType()[i])); // printf("DstData after type conversion at idx %d: %f \n", static_cast(i.value), static_cast(dst_vector_container.template AsType().At(Number{}))); - printf("Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i %d: %hu \n", static_cast(idx_1d.value) , static_cast(i.value), dst_vector_container.template AsType().At(Number{})); + // printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i %d: %hu \n", static_cast(blockIdx.x) , static_cast(idx_1d.value) , static_cast(i.value), dst_vector_container.template AsType().At(Number{})); + printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i %d: %f \n", static_cast(blockIdx.x) , static_cast(idx_1d.value) , static_cast(i.value), static_cast(dst_vector_container.template AsType().At(Number{}))); } + + // Emin @added + __syncthreads(); }); const bool is_dst_valid = @@ -153,9 +171,12 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2 is_dst_valid, dst_vector_container.template AsType()[I0]); - // // Debug: Print data before copying from dst_vector into dst_buf + // // // // Debug: Print data before copying from dst_vector into dst_buf // if (threadIdx.x == 0 && threadIdx.y == 0 && is_dst_valid) { - // printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast(idx_1d.value), dst_buf.template AsType().At(I0)); + // // printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast(idx_1d.value), dst_buf.template AsType().At(I0)); + // // printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast(blockIdx.x) , static_cast(idx_1d.value), dst_buf.template Get(dst_coord_.GetOffset(), is_dst_valid)); + + // printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast(blockIdx.x) , static_cast(idx_1d.value), dst_buf.template Get(dst_coord_.GetOffset(), is_dst_valid)); // } // move coordinate diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index 5e48805b22..0d879d440d 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -15,17 +15,24 @@ else fi cmake \ --D CMAKE_PREFIX_PATH=/opt/rocm-6.2.1/ \ --D CMAKE_CXX_COMPILER=/opt/rocm-6.2.1/bin/hipcc \ +-D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \ -D CMAKE_BUILD_TYPE=Release \ -D BUILD_DEV=ON \ -D GPU_TARGETS=$GPU_TARGETS \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ --D USE_BITINT_EXTENSION_INT4=OFF \ +-D USE_BITINT_EXTENSION_INT4=OFF \ +-D CK_LOGGING=ON \ $REST_ARGS \ ${MY_PROJECT_SOURCE} # -D CMAKE_PREFIX_PATH=/opt/rocm \ -# -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ No newline at end of file +# -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc + + +# -D CMAKE_PREFIX_PATH=/opt/rocm-6.2.1/ \ +# -D CMAKE_CXX_COMPILER=/opt/rocm-6.2.1/bin/hipcc + +# -D CK_LOGGING=ON \ No newline at end of file