From 0ea3268d5de1870c27f64aa9f34dbbaf8fa772ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Thu, 25 Sep 2025 09:41:33 +0000 Subject: [PATCH] Remove debug and other dead code. --- .../ops/epilogue/cshuffle_epilogue.hpp | 121 +----------------- ...ped_convolution_backward_weight_kernel.hpp | 62 --------- 2 files changed, 2 insertions(+), 181 deletions(-) diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index 5cc9f37cf0..1d2208ac39 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -287,31 +287,6 @@ struct CShuffleEpilogue } } - template - CK_TILE_DEVICE void print_tensor_matrix_format( - const static_distributed_tensor& tensor, - const char* /*name = "tensor_matrix"*/) - { - const auto spans = tensor.get_distributed_spans(); - //static_assert(spans.size() == 2, "This function is for 2D tensors only"); - - const auto dim0_span = spans[number<0>{}]; - const auto dim1_span = spans[number<1>{}]; - - //printf("%s matrix format (tid %u):\n", name, threadIdx.x); - - sweep_tile_span(dim0_span, [&](auto row) { - printf(" "); - sweep_tile_span(dim1_span, [&](auto col) { - constexpr auto distributed_indices = make_tuple(row, col); - const auto value = tensor[distributed_indices]; - printf("tid %u: %.7f\n", threadIdx.x, static_cast(value)); - }); - //printf("\n"); - }); - //printf("\n"); - } - template CK_TILE_DEVICE auto merged_op(ODramWindow& out_dram_window, const OAccTile& o_acc_tile, @@ -346,6 +321,8 @@ struct CShuffleEpilogue constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; // Store full data to LDS. + // TODO: No need to store the full data, only the diagnoal blocks are needed. + // Note that in the current data layout, it is not easy to store only the diagonal blocks. block_sync_lds(); static_for<0, num_access, 1>{}([&](auto iAccess) { @@ -419,13 +396,8 @@ struct CShuffleEpilogue {0, 0}, dram_tile_distribution); - // Load static_distributed_tensor from LDS. auto c_out_tensor = load_tile(lds_window); - // DEBUG: Print out the c_out_tensor contents for debugging - // print_tensor_matrix_format(c_out_tensor, "c_out_tensor"); - // __syncthreads(); - const auto ds_tensor = generate_tuple( [&](auto idx) { return load_tile(d_dram_windows[idx]); }, number{}); @@ -455,95 +427,6 @@ struct CShuffleEpilogue }); } }); - - - //--------------------------------------------------------------------- - - - // // 4D tensor view of LDS memory - // // with (g_i, g_j, i, j) where (g_i, g_j) is the group index - // // and (i, j) is the index within the group. - // constexpr auto lds_desc_4d = make_naive_tensor_descriptor( - // make_tuple(number{}, number{}, number{}, number{}), - // make_tuple(number{}, number<1>{}, number{}, number{})); - - // // We must merge (r,m) and (c,n) dimensions together to make a 2D tensor descriptor. - // constexpr auto lds_desc = transform_tensor_descriptor( - // lds_desc_4d, - // make_tuple( - // make_merge_transform(make_tuple(Gs, MPerGroup)), - // make_merge_transform(make_tuple(Gs, NPerGroup)) - // ), - // make_tuple(sequence<0, 2>{}, sequence<1, 3>{}), - // make_tuple(sequence<0>{}, sequence<1>{}) - // ); - - // auto lds_view = make_tensor_view( - // static_cast(p_smem), lds_desc); - - // constexpr auto dram_tile_encoding = tile_distribution_encoding< - // sequence<>, - // tuple, - // sequence<1, Gs, NPerGroup, 1>>, - // tuple, sequence<1,2>>, - // tuple, sequence<2,2>>, - // sequence<1, 1, 2, 2>, - // sequence<0, 3, 0, 3>>{}; - - // constexpr auto dram_tile_distribution = make_static_tile_distribution(dram_tile_encoding); - - // auto d_dram_windows = generate_tuple( - // [&](auto idx) { - // return make_tile_window(ds_dram_windows[idx], dram_tile_distribution); - // }, - // number{}); - - // // Calculate which block in the Gs x Gs space we are located at. - // const auto x_space_coord = dram_tile_distribution.calculate_index(); - // const index_t m_block = x_space_coord[0] / MPerGroup; - // const index_t n_block = x_space_coord[1] / NPerGroup; - - // const auto current_lds_window = make_tile_window( - // lds_view, - // make_tuple(number{}, number{}), - // {0, 0}, - // dram_tile_distribution); - - // // Copy only the diagonal blocks. - // if (m_block == n_block) - // { - // // Load static_distributed_tensor from LDS. - // auto c_out_tensor = load_tile(current_lds_window); - - // // DEBUG: Print out the c_out_tensor contents for debugging - // print_tensor_matrix_format(c_out_tensor, "c_out_tensor"); - // __syncthreads(); - - // // TODO: We must move the d_dram_windows to the correct group position. - // const auto ds_tensor = generate_tuple( - // [&](auto idx) { return load_tile(d_dram_windows[idx]); }, number{}); - - // const auto c_ds_tiles = concat_tuple_of_reference( - // tie(c_out_tensor, c_out_tensor), - // generate_tie([&](auto idx) -> const auto& { return ds_tensor[idx]; }, - // number{})); - - // tile_elementwise_inout_unpack(typename Problem::CDElementwise{}, c_ds_tiles); - - // // Move the output window to the correct position. - // //printf("m_block: %d, n_block: %d \n", m_block, n_block); - // //auto out_window = make_tile_window(out_dram_window, dram_tile_distribution); - // //move_tile_window(out_window, {m_block * MPerGroup, 0}); - - // if constexpr(MemoryOperation == memory_operation_enum::set) - // { - // store_tile(out_window, c_out_tensor); - // } - // else - // { - // update_tile(out_window, c_out_tensor); - // } - // } } template diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp index 7858a4ac42..d1c8452c5e 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp @@ -797,70 +797,8 @@ struct GroupedConvolutionBackwardWeightKernel // Run Epilogue Pipeline auto& c_block_window = gemm_tile_windows.at(I3); - // For debugging - results in very slow compilation. - // if (blockIdx.x == 0 && threadIdx.x == 0) - // { - // const auto c_block_tile_distribution = c_block_tile.get_tile_distribution(); - // print(c_block_tile_distribution); - // } - EpiloguePipeline{}.template operator()( c_block_window, c_block_tile, d_block_window, smem_ptr_0); - - //constexpr index_t MBlockWidth = TilePartitioner::MPerBlock / GroupedConvTraitsType_::NumGroupsToMerge; - //constexpr index_t NBlockWidth = TilePartitioner::NPerBlock / GroupedConvTraitsType_::NumGroupsToMerge; - - //Run LDS to global memory manually, one thread per convolution group. - // if (blockIdx.x == 0 && threadIdx.x < GroupedConvTraitsType_::NumGroupsToMerge) - // { - // const auto group_index = threadIdx.x; - // const index_t c_ptr_offset = group_index * MBlockWidth * NBlockWidth; - // OutDataType* lds_data = reinterpret_cast(smem_ptr_0); - // for (auto i_loc = 0; i_loc < NBlockWidth; ++i_loc) - // { - // const auto lds_index = (group_index * NBlockWidth + i_loc) * TilePartitioner::MPerBlock + group_index; - // c_ptr[c_ptr_offset + i_loc] = lds_data[lds_index]; - // } - // } - - // __syncthreads(); - // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && threadIdx.y == 0) - // { - // constexpr index_t Gs = GroupedConvTraitsType_::NumGroupsToMerge; - // constexpr index_t NBlockWidth = TilePartitioner::NPerBlock / Gs; - - // // Print out LDS contents. - // // The LDS corresponds TilePartitioner_::MPerBlock * TilePartitioner_::NPerBlock matrix. - // // Print LDS contents as matrix - // printf("LDS Contents (%d x %d):\n", TilePartitioner::MPerBlock, TilePartitioner::NPerBlock); - // OutDataType* lds_data = reinterpret_cast(smem_ptr_0); - - // for(int c = 0; c < Gs; ++c) { - // printf("Block %d:\n", c); - // for(int r = 0; r < Gs; ++r) { - // printf("Row %d: ", r); - // for(int n = 0; n < NBlockWidth; ++n) - // { - // int idx = (r * NBlockWidth + n) * TilePartitioner::MPerBlock + c; - // printf("%.7f ", static_cast(lds_data[idx])); - // } - // printf(" \n"); - // } - // printf("\n\n"); - // } - - // // Print out the c_block_window contents for debugging - // printf("C Ptr Contents (%d x %d):\n", TilePartitioner::MPerBlock, NBlockWidth); - // for(int m = 0; m < TilePartitioner::MPerBlock; ++m) { - // for(int n = 0; n < NBlockWidth; ++n) { - // int idx = m * NBlockWidth + n; - // printf("%.7f ", static_cast(c_ptr[idx])); - // if((n + 1) % NBlockWidth == 0) printf("\n "); // Line break every NBlockWidth elements for readability - // } - // printf("\n"); - // } - // } - // __syncthreads(); } /**