diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
index 1207e4ec65..2ea08300ba 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -427,13 +427,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
     {
         MaximumActiveBlocksPerMultiprocessor()
         {
-            constexpr size_t dynSharedMemPerBlk = 0;
-            constexpr size_t ldsMemPerBlk = GridwiseGemm::GetSharedMemoryNumberOfByte();
-            if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
-            {
-                std::cout << "[SPLIT-K AUTODEDUCE] Dynamic shared memory per block: " << dynSharedMemPerBlk << " bytes" << std::endl;
-                std::cout << "[SPLIT-K AUTODEDUCE] LDS memory per block: " << ldsMemPerBlk << " bytes" << std::endl;
-            }
+            constexpr int dynamic_smem_size = 0;
             int max_occupancy = 0;
             hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor(
                                 &max_occupancy,
@@ -450,9 +444,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                                     remove_reference_t<DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>, 
                                     remove_reference_t<DeviceOp::Block2CTileMap>,
                                     ComputePtrOffsetOfStridedBatch<>,
-                                    true>, // TODO: Do we need to test both true/false for HasMainKBlockLoop?
+                                    false>, // Both true/false give the same occupancy.
                                 BlockSize,
-                                dynSharedMemPerBlk));
+                                dynamic_smem_size));
             value_ = std::max(1, max_occupancy);
         }
         int value_;
@@ -553,14 +547,13 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                         input_right_pads,
                         k_batch_initial);
 
-                const auto& a_grid_desc_kbatch_k0_m_k1 = descs_initial[I0];
                 const auto& c_grid_desc_m_n   = descs_initial[I2];
                 const auto& block_2_ctile_map = GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n, M01, N01, k_batch_initial);
-                const auto grid_size = block_2_ctile_map.CalculateGridSize(c_grid_desc_m_n);
-                const auto k_size = a_grid_desc_kbatch_k0_m_k1.GetLength(I0) * a_grid_desc_kbatch_k0_m_k1.GetLength(I1);
                 
-                //const auto multiplier = static_cast<ck::index_t>(-split_k);
-                k_batch_ = get_k_batch_value(max_occupancy.value_, grid_size, k_size, Conv_G_/*, multiplier*/);
+                // Max occupancy is calculated for a batched GEMM kernel where the batch size corresponds to the number of convolution groups.
+                // Hence, the grid is just size of the tile map.
+                const auto grid_size = block_2_ctile_map.CalculateGridSize(c_grid_desc_m_n);
+                k_batch_ = get_k_batch_value(max_occupancy.value_, grid_size);
             }
             else {
                 k_batch_ = split_k;
diff --git a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
index 06b370daa1..f7188d7b93 100644
--- a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
@@ -27,43 +27,14 @@ struct DeviceProperties
   int num_cu_;
 };
 
-inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size, ck::index_t K_size, ck::index_t conv_G /*, ck::index_t multiplier*/)
+inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size)
 {
     static DeviceProperties device_properties;
-    //constexpr ck::index_t default_batch_size = 512;
-    //constexpr ck::index_t min_batch_size = 8192;
-
     const int num_cu = device_properties.num_cu_;
-    // auto target_batch_size = static_cast<ck::index_t>(ck::EnvValue(CK_ENV(CK_SPLIT_K_BATCH_SIZE)));
-    // if (target_batch_size < min_batch_size)
-    // {
-    //   target_batch_size = default_batch_size;
-    // }
-
-    // The optimal split is an integer multiple of (max_occupancy * num_cu) / (1.0 * grid_size * conv_G).
-    // Here we take the integer to be conv_G, i.e., the number of groups.
-    // The number is floored to ensure that we do not exceed the maximum capacity of compute units, i.e, 
-    // we prefer to (N-eps) * max_capacity rather than (N+eps) * max_capacity because the latter leads to
-    // using only eps fraction of capacity on the last wave.
-    // const auto optimal_split = static_cast<ck::index_t>(std::floor((max_occupancy * num_cu) / (1.0 * grid_size)));
-    // auto k_batch = 1;
-    // if (optimal_split > 0 && K_size > target_batch_size)
-    // {
-    //   //The optimal value of k_batch is a multiple of the optimal_split.
-    //   //We need to find the optimal number K values per batch - this gives the optimal k_batch value.
-    //   k_batch = optimal_split;
-    //   const auto current_batch_size = math::integer_divide_ceil(K_size, k_batch);
-    //   if (current_batch_size > target_batch_size)
-    //   {
-    //     // If the current batch size is larger than the target batch size, we need to increase k_batch.
-    //     const ck::index_t multiplier = std::max(1, math::integer_divide_ceil(K_size, target_batch_size * optimal_split));
-    //     k_batch = optimal_split * multiplier;
-    //   }
-    // }
-
     auto k_batch = 1;
+
     constexpr ck::index_t num_waves = 1;
-    const auto optimal_split = static_cast<ck::index_t>(std::floor((max_occupancy * num_cu) / (num_waves * grid_size * conv_G)));
+    const auto optimal_split = static_cast<ck::index_t>(std::floor((max_occupancy * num_cu) / (num_waves * grid_size)));
     if (optimal_split > 1)
     {
       k_batch = optimal_split;
@@ -73,10 +44,7 @@ inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size, c
     {
       std::cout << "[SPLIT-K AUTODEDUCE] Max active thread blocks per CU for GEMM kernel:  " << max_occupancy << std::endl;
       std::cout << "[SPLIT-K AUTODEDUCE] Output grid size (M tiles x N tiles):  " << grid_size << std::endl;
-      std::cout << "[SPLIT-K AUTODEDUCE] K-dim size:  " << K_size << std::endl;
-      std::cout << "[SPLIT-K AUTODEDUCE] Conv groups:  " << conv_G << std::endl;
       std::cout << "[SPLIT-K AUTODEDUCE] Optimal split value:  " << optimal_split << std::endl;
-      //std::cout << "[SPLIT-K AUTODEDUCE] Target batch size:  " << target_batch_size << std::endl;
       std::cout << "[SPLIT-K AUTODEDUCE] Optimal split-k value " << k_batch << " for K-batch."<< std::endl;
     }
     return k_batch;