diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
index 42fd386f70..d79cfd5665 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -547,19 +547,18 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                         input_right_pads,
                         k_batch_initial);
 
-                const auto& a_grid_desc_kbatch_k0_m_k1 = descs_initial[I0];
                 const auto& c_grid_desc_m_n   = descs_initial[I2];
                 const auto& block_2_ctile_map = GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n, M01, N01, k_batch_initial);
-                
-                // Get the total K dimension size so that we don't make split-K value too small.
-                const auto k_size = a_grid_desc_kbatch_k0_m_k1.GetLength(I0) * K1Number * K0PerBlock;
+                const auto gemmK = get_bwd_weight_gemm_k<NDimSpatial>(a_g_n_k_wos_lengths);
 
                 // Max occupancy is calculated for a batched GEMM kernel where the batch size corresponds to the number of convolution groups.
                 // Hence, the grid is just size of the tile map.
                 const auto grid_size = block_2_ctile_map.CalculateGridSize(c_grid_desc_m_n);
-                k_batch_ = get_k_batch_value(max_occupancy.value_, grid_size, k_size);
+                k_dim_size_ = gemmK;
+                k_batch_ = get_k_batch_value(max_occupancy.value_, grid_size);
             }
-            else {
+            else 
+            {
                 k_batch_ = split_k;
             }
             
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
index a13b432862..c10c6062e1 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
@@ -521,18 +521,12 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
                 const auto& b_grid_desc_kbatch_k0_n_k1 = descs_initial[I1];
                 const index_t GemmM = a_grid_desc_kbatch_k0_m_k1.GetLength(I1);
                 const index_t GemmN = b_grid_desc_kbatch_k0_n_k1.GetLength(I1);
-                const index_t GemmK = a_grid_desc_kbatch_k0_m_k1.GetLength(I0) * a_grid_desc_kbatch_k0_m_k1.GetLength(I2);
-
-                // nullptr for output, will be set after workspace set
-                typename GridwiseGemm::Argument gemm_arg{
-                    nullptr, nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, 1};
 
                 // Max occupancy is calculated for a batched GEMM kernel where the batch size corresponds to the number of convolution groups.
                 // Hence, the grid is just size of the tile map.
-                index_t gdx, gdy, gdz;
-                std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(gemm_arg.M, gemm_arg.N, 1, 1);
-                const auto grid_size = gdx * gdy * gdz;
-                k_batch_ = get_k_batch_value(max_occupancy.value_, grid_size, GemmK);
+                const auto grid_size = GridwiseGemm::Block2CTileMap::CalculateGridSize(GemmM, GemmN);
+                k_dim_size_ = get_bwd_weight_gemm_k<NDimSpatial>(a_g_n_k_wos_lengths);
+                k_batch_ = get_k_batch_value(max_occupancy.value_, grid_size);
             }
             else 
             {
diff --git a/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp b/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp
index 5536b8e129..ac85616e69 100644
--- a/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp
@@ -10,8 +10,10 @@ namespace device {
 struct ArgumentSplitK
 {
   index_t k_batch() const { return k_batch_; }
+  index_t k_dim_size() const { return k_dim_size_; } // read-only accessor; yields -1 when k_dim_size_ was never assigned
   protected:
-        index_t k_batch_;
+        index_t k_batch_{-1}; // split-K value; stays -1 until the owning argument assigns it
+        index_t k_dim_size_{-1}; // GEMM K-dimension size; stays -1 until the owning argument assigns it
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
index 719f354db3..af9d0da3cb 100644
--- a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
@@ -4,6 +4,7 @@
 #pragma once
 #include <hip/hip_runtime.h>
 #include "ck/utility/env.hpp"
+#include "ck/utility/number.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
 #include "ck/ck.hpp"
 
@@ -27,18 +28,15 @@ struct DeviceProperties
   int num_cu_;
 };
 
-inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size, ck::index_t k_size)
+inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size)
 {
     static DeviceProperties device_properties;
     const int num_cu = device_properties.num_cu_;
     auto k_batch = 1;
-    //constexpr ck::index_t min_k_per_batch = 16;
-    //const auto max_split_k = math::integer_divide_ceil(k_size, min_k_per_batch);
 
     const auto optimal_split = static_cast<ck::index_t>(std::floor((max_occupancy * num_cu) / (grid_size)));
     if (optimal_split > 1)
     {
-      //k_batch = std::min(optimal_split, max_split_k);
       k_batch = optimal_split;
     }
     
@@ -46,14 +44,28 @@ inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size, c
     {
       std::cout << "[SPLIT-K AUTODEDUCE] Max active thread blocks per CU for GEMM kernel:  " << max_occupancy << std::endl;
       std::cout << "[SPLIT-K AUTODEDUCE] Output grid size:  " << grid_size << std::endl;
-      std::cout << "[SPLIT-K AUTODEDUCE] K-dim size:  " << k_size << std::endl;
-      //std::cout << "[SPLIT-K AUTODEDUCE] Max split-k value:  " << max_split_k << std::endl;
       std::cout << "[SPLIT-K AUTODEDUCE] Optimal split value:  " << optimal_split << std::endl;
       std::cout << "[SPLIT-K AUTODEDUCE] Optimal split-k value " << k_batch << " for K-batch."<< std::endl;
     }
     return k_batch;
 }
 
+// GemmK of the backward-weight GEMM, i.e. N multiplied by every output spatial length.
+// a_g_n_k_wos_lengths holds the lengths in the order G, N, K, Do, Ho, Wo.
+template <ck::index_t NDimSpatial>
+inline index_t get_bwd_weight_gemm_k(const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths)
+{
+  constexpr std::size_t batch_pos = 1;         // N
+  constexpr std::size_t first_spatial_pos = 3; // G, N and K come first
+
+  index_t spatial_volume = 1;
+  for(std::size_t pos = first_spatial_pos; pos < a_g_n_k_wos_lengths.size(); ++pos)
+  {
+    spatial_volume *= a_g_n_k_wos_lengths[pos];
+  }
+  return a_g_n_k_wos_lengths[batch_pos] * spatial_volume;
+}
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
index 15ce6ff025..b518f635b8 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -113,6 +113,16 @@ struct PerfResults
         return ss.str();
     }
 
+    // Stores the K-dim size; a value that differs from an already stored positive one terminates the process.
+    void set_k_dim_size(ck::index_t k_dim_size)
+    {
+        const bool clashes_with_stored = (k_dim_size_ > 0) && (k_dim_size_ != k_dim_size);
+        if (!clashes_with_stored) { k_dim_size_ = k_dim_size; return; }
+
+        std::cerr << "Error: k_dim_size cannot be set multiple times. Old value " << k_dim_size_ << ". New value " << k_dim_size << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
     // Global best results
     std::string best_op_name_;
     float best_avg_time_      = 0;
@@ -135,6 +145,9 @@ struct PerfResults
     float opt_split_k_gb_per_sec_    = 0;
     ck::index_t opt_split_k_best_arg_ = 1;
 
+    // K-dim size taken from the split-K argument's k_dim_size(); -1 means no positive value was recorded
+    ck::index_t k_dim_size_ = -1;
+
     std::vector<std::tuple<std::string, ck::index_t, float>> ranking_;
 };
 
@@ -156,6 +169,7 @@ void write_perf_results_to_file(const PerfResults& perf_results_global,
         }
         file << res.opt_split_k_avg_time_ << separator
              << res.opt_split_k_best_arg_ << separator
+             << res.k_dim_size_ << separator
              << rank << separator
              << total_num;
     };
@@ -364,7 +378,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
     range_copy(conv_param.input_left_pads_, begin(input_left_pads));
     range_copy(conv_param.input_right_pads_, begin(input_right_pads));
 
-    std::vector<ck::index_t> split_k_list = {/*Split-k parameter autodeduction*/-1, 1, 2, 4, 8, 16, 32, 64, 128};
+    std::vector<ck::index_t> split_k_list = {/*Split-k parameter autodeduction*/-1, 1, 2, 4, 8, 16, 32, 64, 128, 256};
     bool profile_all = true;
     if(split_k != "all")
     {
@@ -421,6 +435,12 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
             if (split_k_arg)
             {
                 split_k_arg_value = split_k_arg->k_batch();
+                const auto k_dim_size = split_k_arg->k_dim_size();
+                if (k_dim_size > 0)
+                {
+                    perf_results_local.set_k_dim_size(k_dim_size);
+                    perf_results_global.set_k_dim_size(k_dim_size);
+                }
                 supports_split_k_optimization = true;
             }
 
@@ -587,6 +607,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
                     << std::get<0>(perf_results_global.get_ranking(perf_results_global.opt_split_k_best_op_name_, perf_results_global.opt_split_k_best_arg_))
                     << " / " << std::get<1>(perf_results_global.get_ranking(perf_results_global.opt_split_k_best_op_name_, perf_results_global.opt_split_k_best_arg_))
                     << std::endl;
+            std::cerr << "K-dim size: " << perf_results_global.k_dim_size_ << std::endl;
 
             write_perf_results_to_file(perf_results_global, perf_results_list);
         }