From 0ea3268d5de1870c27f64aa9f34dbbaf8fa772ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Pietil=C3=A4?= <ville.pietila@amd.com>
Date: Thu, 25 Sep 2025 09:41:33 +0000
Subject: [PATCH] Remove debug and other dead code.

---
 .../ops/epilogue/cshuffle_epilogue.hpp        | 121 +-----------------
 ...ped_convolution_backward_weight_kernel.hpp |  62 ---------
 2 files changed, 2 insertions(+), 181 deletions(-)
diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
index 5cc9f37cf0..1d2208ac39 100644
--- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
@@ -287,31 +287,6 @@ struct CShuffleEpilogue
         }
     }
 
-    template <typename DataType, typename StaticTileDistribution>
-    CK_TILE_DEVICE void print_tensor_matrix_format(
-        const static_distributed_tensor<DataType, StaticTileDistribution>& tensor,
-        const char* /*name = "tensor_matrix"*/)
-    {
-        const auto spans = tensor.get_distributed_spans();
-        //static_assert(spans.size() == 2, "This function is for 2D tensors only");
-        
-        const auto dim0_span = spans[number<0>{}];
-        const auto dim1_span = spans[number<1>{}];
-        
-        //printf("%s matrix format (tid %u):\n", name, threadIdx.x);
-        
-        sweep_tile_span(dim0_span, [&](auto row) {
-            printf("  ");
-            sweep_tile_span(dim1_span, [&](auto col) {
-                constexpr auto distributed_indices = make_tuple(row, col);
-                const auto value = tensor[distributed_indices];
-                printf("tid %u: %.7f\n", threadIdx.x, static_cast<float>(value));
-            });
-            //printf("\n");
-        });
-        //printf("\n");
-    }
-
     template <typename ODramWindow, typename OAccTile, typename DsDramWindows>
     CK_TILE_DEVICE auto merged_op(ODramWindow& out_dram_window,
                                    const OAccTile& o_acc_tile,
@@ -346,6 +321,8 @@ struct CShuffleEpilogue
         constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
 
         // Store full data to LDS.
+        // TODO: No need to store the full data, only the diagnoal blocks are needed.
+        // Note that in the current data layout, it is not easy to store only the diagonal blocks.
         block_sync_lds();
         static_for<0, num_access, 1>{}([&](auto iAccess) {
             
@@ -419,13 +396,8 @@ struct CShuffleEpilogue
                 {0, 0},
                 dram_tile_distribution);
 
-            // Load static_distributed_tensor from LDS.
             auto c_out_tensor = load_tile(lds_window);
 
-            // DEBUG: Print out the c_out_tensor contents for debugging
-            // print_tensor_matrix_format(c_out_tensor, "c_out_tensor");
-            // __syncthreads();
-
             const auto ds_tensor = generate_tuple(
                 [&](auto idx) { return load_tile(d_dram_windows[idx]); }, number<NumDTensor>{});
 
@@ -455,95 +427,6 @@ struct CShuffleEpilogue
                 });
             }
         });
-
-
-        //---------------------------------------------------------------------
-
-
-        // // 4D tensor view of LDS memory
-        // // with (g_i, g_j, i, j) where (g_i, g_j) is the group index
-        // // and (i, j) is the index within the group.
-        // constexpr auto lds_desc_4d = make_naive_tensor_descriptor(
-        //     make_tuple(number<Gs>{}, number<Gs>{}, number<MPerGroup>{}, number<NPerGroup>{}),
-        //     make_tuple(number<Gs * MPerGroup * NPerGroup>{}, number<1>{}, number<Gs>{}, number<Gs * MPerGroup>{}));
-
-        // // We must merge (r,m) and (c,n) dimensions together to make a 2D tensor descriptor.
-        // constexpr auto lds_desc = transform_tensor_descriptor(
-        //     lds_desc_4d, 
-        //     make_tuple(
-        //         make_merge_transform(make_tuple(Gs, MPerGroup)),
-        //         make_merge_transform(make_tuple(Gs, NPerGroup))
-        //     ),
-        //     make_tuple(sequence<0, 2>{}, sequence<1, 3>{}), 
-        //     make_tuple(sequence<0>{}, sequence<1>{})
-        // );
-
-        // auto lds_view = make_tensor_view<address_space_enum::lds>(
-        //                  static_cast<ODataType*>(p_smem), lds_desc);
-
-        // constexpr auto dram_tile_encoding = tile_distribution_encoding<
-        //     sequence<>,
-        //     tuple<sequence<1, Gs, MPerGroup, 1>, 
-        //         sequence<1, Gs, NPerGroup, 1>>,
-        //     tuple<sequence<1,2>, sequence<1,2>>,
-        //     tuple<sequence<1,1>, sequence<2,2>>, 
-        //     sequence<1, 1, 2, 2>, 
-        //     sequence<0, 3, 0, 3>>{};
-
-        // constexpr auto dram_tile_distribution = make_static_tile_distribution(dram_tile_encoding);
-
-        // auto d_dram_windows = generate_tuple(
-        //     [&](auto idx) {
-        //         return make_tile_window(ds_dram_windows[idx], dram_tile_distribution);
-        //     },
-        //     number<NumDTensor>{});
-
-        // // Calculate which block in the Gs x Gs space we are located at.
-        // const auto x_space_coord = dram_tile_distribution.calculate_index();
-        // const index_t m_block = x_space_coord[0] / MPerGroup;
-        // const index_t n_block = x_space_coord[1] / NPerGroup;
-
-        // const auto current_lds_window = make_tile_window(
-        //         lds_view,
-        //         make_tuple(number<Gs * MPerGroup>{}, number<Gs * NPerGroup>{}),
-        //         {0, 0},
-        //         dram_tile_distribution);
-
-        // // Copy only the diagonal blocks.
-        // if (m_block == n_block)
-        // {
-        //     // Load static_distributed_tensor from LDS.
-        //     auto c_out_tensor = load_tile(current_lds_window);
-
-        //     // DEBUG: Print out the c_out_tensor contents for debugging
-        //     print_tensor_matrix_format(c_out_tensor, "c_out_tensor");
-        //     __syncthreads();
-
-        //     // TODO: We must move the d_dram_windows to the correct group position.
-        //     const auto ds_tensor = generate_tuple(
-        //         [&](auto idx) { return load_tile(d_dram_windows[idx]); }, number<NumDTensor>{});
-
-        //     const auto c_ds_tiles = concat_tuple_of_reference(
-        //         tie(c_out_tensor, c_out_tensor),
-        //         generate_tie([&](auto idx) -> const auto& { return ds_tensor[idx]; },
-        //                     number<NumDTensor>{}));
-
-        //     tile_elementwise_inout_unpack(typename Problem::CDElementwise{}, c_ds_tiles);
-
-        //     // Move the output window to the correct position.
-        //     //printf("m_block: %d, n_block: %d \n", m_block, n_block);
-        //     //auto out_window = make_tile_window(out_dram_window, dram_tile_distribution);
-        //     //move_tile_window(out_window, {m_block * MPerGroup, 0});
-
-        //     if constexpr(MemoryOperation == memory_operation_enum::set)
-        //     {
-        //         store_tile(out_window, c_out_tensor);
-        //     }
-        //     else
-        //     {
-        //         update_tile(out_window, c_out_tensor);
-        //     }
-        // }
     }
 
     template <typename ODramWindow, typename OAccTile, typename DsDramWindows>
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
index 7858a4ac42..d1c8452c5e 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -797,70 +797,8 @@ struct GroupedConvolutionBackwardWeightKernel
         // Run Epilogue Pipeline
         auto& c_block_window = gemm_tile_windows.at(I3);
 
-        // For debugging - results in very slow compilation.
-        // if (blockIdx.x == 0 && threadIdx.x == 0)
-        // {
-        //     const auto c_block_tile_distribution = c_block_tile.get_tile_distribution();
-        //     print(c_block_tile_distribution);
-        // }
-
         EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile)>(
             c_block_window, c_block_tile, d_block_window, smem_ptr_0);
-
-        //constexpr index_t MBlockWidth = TilePartitioner::MPerBlock / GroupedConvTraitsType_::NumGroupsToMerge;
-        //constexpr index_t NBlockWidth = TilePartitioner::NPerBlock / GroupedConvTraitsType_::NumGroupsToMerge;
-
-        //Run LDS to global memory manually, one thread per convolution group.
-        // if (blockIdx.x == 0 && threadIdx.x < GroupedConvTraitsType_::NumGroupsToMerge)
-        // {
-        //     const auto group_index = threadIdx.x;
-        //     const index_t c_ptr_offset = group_index * MBlockWidth * NBlockWidth;
-        //     OutDataType* lds_data = reinterpret_cast<OutDataType*>(smem_ptr_0);
-        //     for (auto i_loc = 0; i_loc < NBlockWidth; ++i_loc)
-        //     {
-        //         const auto lds_index = (group_index * NBlockWidth + i_loc) * TilePartitioner::MPerBlock + group_index;
-        //         c_ptr[c_ptr_offset + i_loc] = lds_data[lds_index];
-        //     }
-        // }
-
-        // __syncthreads();
-        // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && threadIdx.y == 0)
-        // {
-        //     constexpr index_t Gs = GroupedConvTraitsType_::NumGroupsToMerge;
-        //     constexpr index_t NBlockWidth = TilePartitioner::NPerBlock / Gs;
-
-        //     // Print out LDS contents.
-        //     // The LDS corresponds TilePartitioner_::MPerBlock * TilePartitioner_::NPerBlock matrix.
-        //     // Print LDS contents as matrix
-        //     printf("LDS Contents (%d x %d):\n", TilePartitioner::MPerBlock, TilePartitioner::NPerBlock);
-        //     OutDataType* lds_data = reinterpret_cast<OutDataType*>(smem_ptr_0);
-            
-        //     for(int c = 0; c < Gs; ++c) {
-        //         printf("Block %d:\n", c);
-        //         for(int r = 0; r < Gs; ++r) {
-        //             printf("Row %d: ", r);
-        //             for(int n = 0; n < NBlockWidth; ++n) 
-        //             {
-        //                 int idx =  (r * NBlockWidth + n) * TilePartitioner::MPerBlock + c;
-        //                 printf("%.7f ", static_cast<float>(lds_data[idx]));
-        //             }
-        //             printf(" \n");
-        //         }
-        //         printf("\n\n");
-        //     }
-
-        //     // Print out the c_block_window contents for debugging
-        //     printf("C Ptr Contents (%d x %d):\n", TilePartitioner::MPerBlock, NBlockWidth);
-        //     for(int m = 0; m < TilePartitioner::MPerBlock; ++m) {
-        //         for(int n = 0; n < NBlockWidth; ++n) {
-        //             int idx = m * NBlockWidth + n;
-        //             printf("%.7f ", static_cast<float>(c_ptr[idx]));
-        //             if((n + 1) % NBlockWidth == 0) printf("\n  "); // Line break every NBlockWidth elements for readability
-        //         }
-        //         printf("\n");
-        //     }
-        // }
-        // __syncthreads();
     }
 
     /**