diff --git a/example/09_convnd_fwd/convnd_fwd_common.hpp b/example/09_convnd_fwd/convnd_fwd_common.hpp
index 316dcadb7b..9d528bf737 100644
--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
@@ -131,6 +131,9 @@ template
           typename DeviceGroupedConvNDFwdInstance,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
           typename ComputeDataType>
 bool run_grouped_conv_fwd(int do_verification,
                           int init_method,
@@ -283,31 +286,25 @@ bool run_grouped_conv_fwd(int do_verification,
         DeviceMem out_device_ref_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
         out_device_ref_buf.SetZero();
 
-        // Extract dimensions using helper function
-        ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
-
-        // Launch GPU reference kernel
-        constexpr ck::index_t block_size = 256;
-        const ck::long_index_t output_length = dims.N * dims.Do * dims.Ho * dims.Wo * dims.K;
-        const ck::index_t grid_size = (output_length + block_size - 1) / block_size;
-
-        auto gpu_ref_kernel = ck::ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk;
-
-        gpu_ref_kernel<<<grid_size, block_size>>>(
+        // Call GPU reference with ConvParam directly, using the correct layout types
+        ck::ref::naive_conv_fwd<InDataType, WeiDataType, OutDataType, InLayout, WeiLayout, OutLayout>(
             reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
             reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
             reinterpret_cast<OutDataType*>(out_device_ref_buf.GetDeviceBuffer()),
-            dims);
+            conv_param);
 
         HIP_CHECK_ERROR(hipDeviceSynchronize());
-        std::cout << "GPU reference kernel completed successfully, copying results..." << std::endl;
+        std::cout << "GPU reference function completed successfully, copying results..."
+                  << std::endl;
 
         // Copy GPU reference result to host
         out_device_ref_buf.FromDevice(out_host.mData.data());
diff --git a/example/09_convnd_fwd/run_convnd_fwd_example.inc b/example/09_convnd_fwd/run_convnd_fwd_example.inc
index 4c394821a8..33eddaf0d5 100644
--- a/example/09_convnd_fwd/run_convnd_fwd_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_example.inc
@@ -12,7 +12,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
 {
     print_helper_msg();
 
-    int do_verification = 1; // 0=no, 1=CPU, 2=GPU
+    int do_verification = 2; // 0=no, 1=CPU, 2=GPU
     int init_method = 1;
     bool time_kernel = false;
@@ -71,6 +71,9 @@ bool run_convnd_fwd_example(int argc, char* argv[])
                               WeiElementOp,
                               OutElementOp,
                               DeviceGroupedConvNDFwdInstance,
+                              InLayout,
+                              WeiLayout,
+                              OutLayout,
                               ComputeDataType>(do_verification,
                                                init_method,
                                                time_kernel,
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
index f278107c9e..c8f1b1459e 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
@@ -18,7 +18,8 @@
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
 #include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
-#include "ck_tile/host/hip_check_error.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
 
 using ::ck::DeviceMem;
 using ::ck::HostTensorDescriptor;
@@ -81,7 +82,10 @@ template
-          typename DeviceConvNdBwdDataInstance>
+          typename DeviceConvNdBwdDataInstance,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
 int run_conv_bwd_data(int do_verification,
                       int init_method,
                       bool time_kernel,
@@ -225,50 +229,52 @@ int run_conv_bwd_data(int do_verification,
     }
     else if(do_verification == 2)
     {
-        // GPU verification
+        // GPU verification using naive GPU reference
         std::cout << "Running GPU verification..." << std::endl;
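+        // For bwd-data the data flow is reversed: out_device_buf (the output gradient) and
+        // wei_device_buf are the inputs, and the reference writes the input gradient.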
+        // Allocate and ZERO GPU memory for reference input
         DeviceMem in_device_ref_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
         in_device_ref_buf.SetZero();
 
-        // Extract dimensions using helper function
-        ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
-
-        constexpr ck::index_t block_size = 256;
-        const ck::long_index_t input_length = dims.N * dims.Di * dims.Hi * dims.Wi * dims.C;
-        const ck::index_t grid_size = (input_length + block_size - 1) / block_size;
-
-        auto gpu_ref_kernel = ck::ref::naive_conv_bwd_data_ndhwc_kzyxc_ndhwk;
-
-        gpu_ref_kernel<<<grid_size, block_size>>>(
+        // Call GPU reference with ConvParam directly, using the correct layout types
+        ck::ref::naive_conv_bwd_data<InDataType, WeiDataType, OutDataType, InLayout, WeiLayout, OutLayout>(
            reinterpret_cast<InDataType*>(in_device_ref_buf.GetDeviceBuffer()),
            reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
            reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            dims);
+            conv_param,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
 
         HIP_CHECK_ERROR(hipDeviceSynchronize());
-        std::cout << "GPU reference kernel completed, copying results..." << std::endl;
+        std::cout << "GPU reference function completed successfully, copying results..."
+                  << std::endl;
 
-        // Copy GPU reference result
+        // Copy GPU reference result to host
         Tensor<InDataType> in_gpu_ref(in_host.mDesc);
         in_device_ref_buf.FromDevice(in_gpu_ref.mData.data());
 
-        // Copy optimized kernel result
+        // Copy GPU kernel result to host
         in_device_buf.FromDevice(in_device.mData.data());
 
+        std::cout << "Comparing GPU kernel output vs GPU reference..." << std::endl;
+
         // Compare: Optimized kernel result vs GPU reference result
         bool pass = ck::utils::check_err(
             in_device, in_gpu_ref, "Error: Incorrect results!", get_rtol(), get_atol());
+        std::cout << "GPU verification result is: " << (pass ? "pass" : "fail") << std::endl;
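+        // check_err compares the two device results element-wise using the example's
+        // rtol/atol tolerance helpers.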
 
         return pass ? 0 : 1;
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
index eaae245db1..73aa7c50e3 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp
@@ -92,16 +92,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<1>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<1>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
     else if(conv_param.num_dim_spatial_ == 2)
     {
@@ -128,16 +131,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<2>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<2>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
     else if(conv_param.num_dim_spatial_ == 3)
     {
@@ -164,16 +170,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<3>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<3>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
 
     return 0;
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
index 3d1c70bea0..9f83620c94 100644
--- a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp
@@ -119,16 +119,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<1>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<1>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
     else if(conv_param.num_dim_spatial_ == 2)
     {
@@ -155,16 +158,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<2>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<2>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
     else if(conv_param.num_dim_spatial_ == 3)
     {
@@ -191,16 +197,19 @@ int main(int argc, char* argv[])
                           InElementOp,
                           WeiElementOp,
                           OutElementOp,
-                          DeviceConvNdBwdDataInstance<3>>(do_verification,
-                                                          init_method,
-                                                          time_kernel,
-                                                          conv_param,
-                                                          in_g_n_c_wis_desc,
-                                                          wei_g_k_c_xs_desc,
-                                                          out_g_n_k_wos_desc,
-                                                          in_element_op,
-                                                          wei_element_op,
-                                                          out_element_op);
+                          DeviceConvNdBwdDataInstance<3>,
+                          InLayout,
+                          WeiLayout,
+                          OutLayout>(do_verification,
+                                     init_method,
+                                     time_kernel,
+                                     conv_param,
+                                     in_g_n_c_wis_desc,
+                                     wei_g_k_c_xs_desc,
+                                     out_g_n_k_wos_desc,
+                                     in_element_op,
+                                     wei_element_op,
+                                     out_element_op);
     }
 
     return 0;
diff --git a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
index 8cc9f582eb..70c43b81b3 100644
--- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
@@ -149,55 +149,53 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
     }
     else if(config.do_verification == 2)
     {
-        // GPU verification (only supports G=1, standard convolution)
-        if(conv_param.G_ != 1)
-        {
-            std::cout << "GPU verification only supports G=1 (standard convolution)" << std::endl;
-            std::cout << "Current G=" << conv_param.G_ << " not supported." << std::endl;
-            std::cout << "Use do_verification=1 for CPU verification with grouped convolution."
-                      << std::endl;
-            return true;
-        }
-
-        std::cout << "Running GPU verification (G=1)..." << std::endl;
+        // GPU verification using naive GPU reference
+        std::cout << "Running GPU verification..." << std::endl;
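+        // The ConvParam-based reference handles grouped convolution directly, so the old
+        // G == 1 restriction is gone and grouped cases are verified on the GPU as well.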
"correct" : "fail") << std::endl; return pass; diff --git a/example/test_old_ck_gpu_reference.cpp b/example/test_old_ck_gpu_reference.cpp deleted file mode 100644 index 9f12eaea4d..0000000000 --- a/example/test_old_ck_gpu_reference.cpp +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -// Standalone test program for Old CK GPU references -// Tests naive_conv_fwd (existing) and future backward ops - -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" - -// CPU reference for validation -#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" - -// GPU reference (OLD CK - already exists!) -#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" - -using namespace ck; - -template -struct ConvParams -{ - index_t N, K, C; - std::vector input_spatial; - std::vector filter_spatial; - std::vector output_spatial; - std::vector strides; - std::vector dilations; - std::vector pads; -}; - -template -bool test_conv_forward_gpu_ref(const ConvParams& params, const std::string& test_name) -{ - std::cout << "[TEST] " << test_name << std::endl; - - // Calculate dimensions - const index_t N = params.N; - const index_t K = params.K; - const index_t C = params.C; - - // Create tensor descriptors (NDHWC layout for old CK) - std::vector in_lengths = {N}; - for(auto d : params.input_spatial) - in_lengths.push_back(d); - in_lengths.push_back(C); - - std::vector wei_lengths = {K}; - for(auto d : params.filter_spatial) - wei_lengths.push_back(d); - wei_lengths.push_back(C); - - std::vector out_lengths = {N}; - for(auto d : params.output_spatial) - out_lengths.push_back(d); - out_lengths.push_back(K); - - // Create host tensors - Tensor input(in_lengths); - Tensor weight(wei_lengths); - Tensor output_gpu(out_lengths); - Tensor output_ref(out_lengths); - - // Initialize with random data - input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - weight.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - - // Allocate device memory - DeviceMem input_dev(input.mData.size() * sizeof(InDataType)); - DeviceMem weight_dev(weight.mData.size() * sizeof(WeiDataType)); - DeviceMem output_dev(output_gpu.mData.size() * sizeof(OutDataType)); - - // Copy to device - input_dev.ToDevice(input.mData.data()); - weight_dev.ToDevice(weight.mData.data()); - - // Run CPU reference for validation - auto ref_conv = - tensor_operation::host::ReferenceConvFwd(); - - auto ref_invoker = ref_conv.MakeInvoker(); - auto ref_arg = ref_conv.MakeArgument(input.mData.data(), - weight.mData.data(), - output_ref.mData.data(), - N, - K, - C, - params.input_spatial, - params.filter_spatial, - params.output_spatial, - params.strides, - params.dilations, - params.pads, - params.pads, - {}, - {}, - {}); - - ref_invoker.Run(ref_arg); - - // Run GPU reference (OLD CK) - using InElementOp = tensor_operation::element_wise::PassThrough; - using WeiElementOp = tensor_operation::element_wise::PassThrough; - using OutElementOp = tensor_operation::element_wise::PassThrough; - - constexpr index_t block_size = 256; - - // Extract dimensions based on NDimSpatial - index_t Di = 1, Hi = 1, Wi = 1; - index_t Z = 1, Y = 1, X = 1; - index_t Do = 1, Ho = 1, Wo = 1; - index_t stride_z = 
-    index_t dilation_z = 1, dilation_y = 1, dilation_x = 1;
-    index_t pad_z = 0, pad_y = 0, pad_x = 0;
-
-    if(NDimSpatial == 1)
-    {
-        Wi = params.input_spatial[0];
-        X = params.filter_spatial[0];
-        Wo = params.output_spatial[0];
-        stride_x = params.strides[0];
-        dilation_x = params.dilations[0];
-        pad_x = params.pads[0];
-    }
-    else if(NDimSpatial == 2)
-    {
-        Hi = params.input_spatial[0];
-        Wi = params.input_spatial[1];
-        Y = params.filter_spatial[0];
-        X = params.filter_spatial[1];
-        Ho = params.output_spatial[0];
-        Wo = params.output_spatial[1];
-        stride_y = params.strides[0];
-        stride_x = params.strides[1];
-        dilation_y = params.dilations[0];
-        dilation_x = params.dilations[1];
-        pad_y = params.pads[0];
-        pad_x = params.pads[1];
-    }
-    else if(NDimSpatial == 3)
-    {
-        Di = params.input_spatial[0];
-        Hi = params.input_spatial[1];
-        Wi = params.input_spatial[2];
-        Z = params.filter_spatial[0];
-        Y = params.filter_spatial[1];
-        X = params.filter_spatial[2];
-        Do = params.output_spatial[0];
-        Ho = params.output_spatial[1];
-        Wo = params.output_spatial[2];
-        stride_z = params.strides[0];
-        stride_y = params.strides[1];
-        stride_x = params.strides[2];
-        dilation_z = params.dilations[0];
-        dilation_y = params.dilations[1];
-        dilation_x = params.dilations[2];
-        pad_z = params.pads[0];
-        pad_y = params.pads[1];
-        pad_x = params.pads[2];
-    }
-
-    // Launch GPU reference kernel
-    const long_index_t output_length = N * Do * Ho * Wo * K;
-    const index_t grid_size = (output_length + block_size - 1) / block_size;
-
-    hipLaunchKernelGGL(ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk,
-                       dim3(grid_size),
-                       dim3(block_size),
-                       0,
-                       nullptr,
-                       reinterpret_cast<const InDataType*>(input_dev.GetDeviceBuffer()),
-                       reinterpret_cast<const WeiDataType*>(weight_dev.GetDeviceBuffer()),
-                       reinterpret_cast<OutDataType*>(output_dev.GetDeviceBuffer()),
-                       N, K, C,
-                       Di, Hi, Wi,
-                       Z, Y, X,
-                       Do, Ho, Wo,
-                       stride_z, stride_y, stride_x,
-                       dilation_z, dilation_y, dilation_x,
-                       pad_z, pad_y, pad_x);
-
-    hipDeviceSynchronize();
-
-    // Copy result back
-    output_dev.FromDevice(output_gpu.mData.data());
-
-    // Compare GPU ref vs CPU ref
-    bool pass = check_err(output_gpu.mData, output_ref.mData, "GPU vs CPU ref", 1e-3, 1e-3);
-
-    std::cout << "  Result: " << (pass ? "✅ PASS" : "❌ FAIL") << std::endl;
"✅ PASS" : "❌ FAIL") << std::endl; - - return pass; -} - -int main(int argc, char* argv[]) -{ - std::cout << "========================================" << std::endl; - std::cout << "Old CK GPU Reference Test Program" << std::endl; - std::cout << "========================================" << std::endl; - std::cout << std::endl; - - int passed = 0; - int failed = 0; - - // Test 1: 2D Conv, FP16, Small - { - ConvParams<2> params; - params.N = 2; - params.K = 8; - params.C = 8; - params.input_spatial = {7, 7}; - params.filter_spatial = {3, 3}; - params.output_spatial = {5, 5}; - params.strides = {1, 1}; - params.dilations = {1, 1}; - params.pads = {0, 0}; - - if(test_conv_forward_gpu_ref<2, half_t, half_t, half_t>(params, "2D-FP16-Small")) - passed++; - else - failed++; - } - - // Test 2: 2D Conv, FP32, Medium - { - ConvParams<2> params; - params.N = 4; - params.K = 16; - params.C = 16; - params.input_spatial = {14, 14}; - params.filter_spatial = {3, 3}; - params.output_spatial = {12, 12}; - params.strides = {1, 1}; - params.dilations = {1, 1}; - params.pads = {0, 0}; - - if(test_conv_forward_gpu_ref<2, float, float, float>(params, "2D-FP32-Medium")) - passed++; - else - failed++; - } - - // Test 3: 1D Conv, FP16 - { - ConvParams<1> params; - params.N = 2; - params.K = 8; - params.C = 8; - params.input_spatial = {16}; - params.filter_spatial = {3}; - params.output_spatial = {14}; - params.strides = {1}; - params.dilations = {1}; - params.pads = {0}; - - if(test_conv_forward_gpu_ref<1, half_t, half_t, half_t>(params, "1D-FP16")) - passed++; - else - failed++; - } - - // Test 4: 3D Conv, FP16, Small - { - ConvParams<3> params; - params.N = 1; - params.K = 8; - params.C = 8; - params.input_spatial = {5, 5, 5}; - params.filter_spatial = {3, 3, 3}; - params.output_spatial = {3, 3, 3}; - params.strides = {1, 1, 1}; - params.dilations = {1, 1, 1}; - params.pads = {0, 0, 0}; - - if(test_conv_forward_gpu_ref<3, half_t, half_t, half_t>(params, "3D-FP16-Small")) - passed++; - else - failed++; - } - - // Test 5: 2D Conv with stride - { - ConvParams<2> params; - params.N = 2; - params.K = 8; - params.C = 8; - params.input_spatial = {8, 8}; - params.filter_spatial = {3, 3}; - params.output_spatial = {3, 3}; - params.strides = {2, 2}; - params.dilations = {1, 1}; - params.pads = {0, 0}; - - if(test_conv_forward_gpu_ref<2, half_t, half_t, half_t>(params, "2D-FP16-Stride2")) - passed++; - else - failed++; - } - - std::cout << std::endl; - std::cout << "========================================" << std::endl; - std::cout << "SUMMARY" << std::endl; - std::cout << "========================================" << std::endl; - std::cout << "Total: " << (passed + failed) << std::endl; - std::cout << "Passed: " << passed << " ✅" << std::endl; - std::cout << "Failed: " << failed << std::endl; - std::cout << std::endl; - - if(failed == 0) - { - std::cout << "🎉 ALL TESTS PASSED!" 
-        std::cout << "Old CK Forward GPU Reference: WORKING ✅" << std::endl;
-        return 0;
-    }
-    else
-    {
-        std::cout << "❌ SOME TESTS FAILED" << std::endl;
-        return 1;
-    }
-}
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
index 2cdb70e2a2..a8e5b18ed8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
@@ -7,11 +7,12 @@
 #include
 #include
 #include
-#include "conv_util.hpp"
-#include "device.hpp"
-#include "device_conv_fwd.hpp"
-#include "common_header.hpp"
-#include "naive_conv_fwd_gpu.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/stream_config.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -26,7 +27,16 @@ template
 struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K
-    : public DeviceConvFwd
+    : public DeviceConvFwd<3,
+                           ck::tensor_layout::convolution::NDHWC,
+                           ck::tensor_layout::convolution::KZYXC,
+                           ck::tensor_layout::convolution::NDHWK,
+                           InDataType,
+                           WeiDataType,
+                           OutDataType,
+                           InElementwiseOperation,
+                           WeiElementwiseOperation,
+                           OutElementwiseOperation>
 {
     using DeviceOp = DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K;
@@ -57,6 +67,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
                  WeiElementwiseOperation wei_element_op,
                  OutElementwiseOperation out_element_op)
             : params_{3,
+                      1, // G (group count, always 1 for non-grouped)
                       N,
                       K,
                       C,
@@ -78,7 +89,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
         }
 
         // private:
-        utils::conv::ConvParams params_;
+        utils::conv::ConvParam params_;
         std::vector<index_t> out_spatial_lengths_;
 
         const InDataType* p_in_;
@@ -97,46 +108,28 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            const auto naive_conv3d_fwd =
-                ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk;
+            using InLayout  = ck::tensor_layout::convolution::GNCDHW;
+            using WeiLayout = ck::tensor_layout::convolution::GKCZYX;
+            using OutLayout = ck::tensor_layout::convolution::GNKDHW;
 
-            float ave_time = launch_and_time_kernel(stream_config,
-                                                    naive_conv3d_fwd,
-                                                    dim3(256),
-                                                    dim3(256),
-                                                    0,
-                                                    arg.p_in_,
-                                                    arg.p_wei_,
-                                                    arg.p_out_,
-                                                    arg.N_, arg.K_, arg.C_,
-                                                    arg.in_spatial_lengths_[0],
-                                                    arg.in_spatial_lengths_[1],
-                                                    arg.in_spatial_lengths_[2],
-                                                    arg.filter_spatial_lengths_[0],
-                                                    arg.filter_spatial_lengths_[1],
-                                                    arg.filter_spatial_lengths_[2],
-                                                    arg.out_spatial_lengths_[0],
-                                                    arg.out_spatial_lengths_[1],
-                                                    arg.out_spatial_lengths_[2],
-                                                    arg.conv_filter_strides_[0],
-                                                    arg.conv_filter_strides_[1],
-                                                    arg.conv_filter_strides_[2],
-                                                    arg.conv_filter_dilations_[0],
-                                                    arg.conv_filter_dilations_[1],
-                                                    arg.conv_filter_dilations_[2],
-                                                    arg.in_left_pads_[0],
-                                                    arg.in_left_pads_[1],
-                                                    arg.in_left_pads_[2]);
-
-            return ave_time;
+            // Use simplified ConvParam-based API
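+            // A single ConvParam argument replaces the two dozen scalar dimension
+            // arguments the old kernel launch had to spell out.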
+            ref::naive_conv_fwd<InDataType, WeiDataType, OutDataType, InLayout, WeiLayout, OutLayout>(
+                arg.p_in_,
+                arg.p_wei_,
+                arg.p_out_,
+                arg.params_,
+                arg.in_element_op_,
+                arg.wei_element_op_,
+                arg.out_element_op_,
+                stream_config.stream_id_);
+            return 0; // No timing for naive implementation
         }
 
         // polymorphic
@@ -155,7 +148,9 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
     static bool IsSupportedArgument(const Argument& arg)
     {
-        std::vector<index_t> out_spatial_lengths = arg.params_.GetOutputSpatialLengths();
+        auto out_spatial_lengths_long = arg.params_.GetOutputSpatialLengths();
+        std::vector<index_t> out_spatial_lengths(out_spatial_lengths_long.begin(),
+                                                 out_spatial_lengths_long.end());
 
         bool out_lengths_are_consistent = out_spatial_lengths[0] == arg.out_spatial_lengths_[0] &&
                                           out_spatial_lengths[1] == arg.out_spatial_lengths_[1] &&
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/conv_common.hpp b/library/include/ck/library/reference_tensor_operation/gpu/conv_common.hpp
deleted file mode 100644
index 285271c6ef..0000000000
--- a/library/include/ck/library/reference_tensor_operation/gpu/conv_common.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#ifndef CONV_COMMON_HPP
-#define CONV_COMMON_HPP
-
-#include "ck/ck.hpp"
-
-namespace ck {
-namespace ref {
-
-// Device-compatible dimension structure for GPU reference kernels
-// Replaces passing 24 individual parameters
-struct ConvDims
-{
-    index_t N, K, C;
-    index_t Di, Hi, Wi;
-    index_t Z, Y, X;
-    index_t Do, Ho, Wo;
-    index_t stride_z, stride_y, stride_x;
-    index_t dilation_z, dilation_y, dilation_x;
-    index_t pad_z, pad_y, pad_x;
-};
-
-} // namespace ref
-
-// Helper function to extract dimensions from ConvParam for GPU kernels
-// Defined in ck::utils::conv namespace for convenience
-namespace utils {
-namespace conv {
-
-inline ck::ref::ConvDims
-extract_conv_dims(const ConvParam& conv_param, ck::index_t NDimSpatial, bool apply_group = true)
-{
-    ck::ref::ConvDims dims;
-    dims.N = conv_param.N_;
-    dims.K = conv_param.K_;
-    dims.C = apply_group ? (conv_param.C_ * conv_param.G_) : conv_param.C_;
-
-    dims.Di = (NDimSpatial >= 3) ? conv_param.input_spatial_lengths_[0] : 1;
-    dims.Hi = (NDimSpatial >= 2) ? conv_param.input_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
-    dims.Wi = conv_param.input_spatial_lengths_[NDimSpatial - 1];
-
-    dims.Z = (NDimSpatial >= 3) ? conv_param.filter_spatial_lengths_[0] : 1;
-    dims.Y = (NDimSpatial >= 2) ? conv_param.filter_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
-    dims.X = conv_param.filter_spatial_lengths_[NDimSpatial - 1];
-
-    dims.Do = (NDimSpatial >= 3) ? conv_param.output_spatial_lengths_[0] : 1;
-    dims.Ho = (NDimSpatial >= 2) ? conv_param.output_spatial_lengths_[NDimSpatial >= 3 ? 1 : 0] : 1;
-    dims.Wo = conv_param.output_spatial_lengths_[NDimSpatial - 1];
-
-    dims.stride_z = (NDimSpatial >= 3) ? conv_param.conv_filter_strides_[0] : 1;
-    dims.stride_y =
-        (NDimSpatial >= 2) ? conv_param.conv_filter_strides_[NDimSpatial >= 3 ? 1 : 0] : 1;
-    dims.stride_x = conv_param.conv_filter_strides_[NDimSpatial - 1];
-
-    dims.dilation_z = (NDimSpatial >= 3) ? conv_param.conv_filter_dilations_[0] : 1;
-    dims.dilation_y =
-        (NDimSpatial >= 2) ? conv_param.conv_filter_dilations_[NDimSpatial >= 3 ? 1 : 0] : 1;
-    dims.dilation_x = conv_param.conv_filter_dilations_[NDimSpatial - 1];
-
-    dims.pad_z = (NDimSpatial >= 3) ? conv_param.input_left_pads_[0] : 0;
-    dims.pad_y = (NDimSpatial >= 2) ? conv_param.input_left_pads_[NDimSpatial >= 3 ? 1 : 0] : 0;
-    dims.pad_x = conv_param.input_left_pads_[NDimSpatial - 1];
-
-    return dims;
-}
-
-} // namespace conv
-} // namespace utils
-} // namespace ck
-
-#endif
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp
index 686b1a0d34..aecf519c10 100644
--- a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp
@@ -4,146 +4,515 @@
 #pragma once
 
 #include "ck/utility/type_convert.hpp"
-#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck {
 namespace ref {
 
-/*
- * \brief naive implementation of 3D convolution backward data.
- * Layout is (NDHWC, KZYXC, NDHWK).
- * Computes gradient with respect to input.
- *
- * \param N number of batches
- * \param K number of filters (output channels)
- * \param C number of input channels
- * \param (Di, Hi, Wi) depth, height and width dimension of input
- * \param (Z, Y, X) depth, height and width dimensions of filter
- * \param (Do, Ho, Wo) depth, height and width dimension of output
- * \param (stride_z, stride_y, stride_x) strides
- * \param (dilation_z, dilation_y, dilation_x) dilations
- * \param (pad_z, pad_y, pad_x) pads
- */
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename TAcc,
-          typename InElementwiseOperation,
-          typename WeiElementwiseOperation,
-          typename OutElementwiseOperation>
-__global__ void naive_conv_bwd_data_ndhwc_kzyxc_ndhwk(TIn* __restrict__ p_in_grad,
-                                                      const TWei* __restrict__ p_wei,
-                                                      const TOut* __restrict__ p_out_grad,
-                                                      const ConvDims dims)
+// Optimized backward data convolution kernel working with packed (contiguous) tensors
+// Computes gradients w.r.t. input from output gradients and weights
+// Assumes row-major packing: input[G][N][C][spatial], weight[G][K][C][filter],
+// output[G][N][K][spatial]
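+// Example: with this packing, input-gradient element (g, n, c, hi, wi) of a 2D problem
+// lives at offset ((((g * N + n) * C + c) * Hi + hi) * Wi + wi), which is exactly what
+// the per-dimension strides computed inside the kernel encode.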
+template <index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp>
+__global__ void naive_conv_bwd_data_packed(InDataType* __restrict__ p_in,
+                                           const WeiDataType* __restrict__ p_wei,
+                                           const OutDataType* __restrict__ p_out,
+                                           index_t G,
+                                           index_t N,
+                                           index_t K,
+                                           index_t C,
+                                           index_t Di,
+                                           index_t Hi,
+                                           index_t Wi,
+                                           index_t Z,
+                                           index_t Y,
+                                           index_t X,
+                                           index_t Do,
+                                           index_t Ho,
+                                           index_t Wo,
+                                           index_t stride_z,
+                                           index_t stride_y,
+                                           index_t stride_x,
+                                           index_t dilation_z,
+                                           index_t dilation_y,
+                                           index_t dilation_x,
+                                           index_t pad_z,
+                                           index_t pad_y,
+                                           index_t pad_x,
+                                           InElementOp in_op,
+                                           WeiElementOp wei_op,
+                                           OutElementOp out_op)
 {
-    const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const index_t num_threads = blockDim.x * gridDim.x;
-    const long_index_t input_length = dims.N * dims.Di * dims.Hi * dims.Wi * dims.C;
+    const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    const long_index_t num_threads = blockDim.x * gridDim.x;
 
-    const index_t in_strides[] = {
-        dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C};
-    const index_t out_strides[] = {
-        dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K};
-    const index_t wei_strides[] = {
-        dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C};
+    InDataType in_val = InDataType{0};
+    WeiDataType wei_val = WeiDataType{0};
+    OutDataType out_val = OutDataType{0};
 
-    constexpr auto in_op = InElementwiseOperation{};
-    constexpr auto wei_op = WeiElementwiseOperation{};
-    constexpr auto out_op = OutElementwiseOperation{};
-
-    TIn in_val = TIn{0};
-    TWei wei_val = TWei{0};
-    TOut out_val = TOut{0};
-
-    for(long_index_t ii = tid; ii < input_length; ii += num_threads)
+    if constexpr(NDimSpatial == 1)
     {
-        // Decode linear index to (n, di, hi, wi, c)
-        const index_t n = ii / in_strides[0];
-        index_t tmp = ii - n * in_strides[0];
-        const index_t di = tmp / in_strides[1];
-        tmp -= di * in_strides[1];
-        const index_t hi = tmp / in_strides[2];
-        tmp -= hi * in_strides[2];
-        const index_t wi = tmp / in_strides[3];
-        tmp -= wi * in_strides[3];
-        const index_t c = tmp;
+        const long_index_t num_in = G * N * C * Wi;
+        const long_index_t out_stride_g = N * K * Wo;
+        const long_index_t out_stride_n = K * Wo;
+        const long_index_t out_stride_k = Wo;
+        const long_index_t wei_stride_g = K * C * X;
+        const long_index_t wei_stride_k = C * X;
+        const long_index_t wei_stride_c = X;
+        const long_index_t in_stride_g = N * C * Wi;
+        const long_index_t in_stride_n = C * Wi;
+        const long_index_t in_stride_c = Wi;
 
-        // Always accumulate in float
-        float acc_float = 0.0f;
-
-        const TOut* out_n = p_out_grad + static_cast<long_index_t>(n) * out_strides[0];
-
-        // Loop over output channels
-        for(index_t k = 0; k < dims.K; ++k)
+        for(long_index_t idx = tid; idx < num_in; idx += num_threads)
         {
-            const TWei* wei_k = p_wei + static_cast<long_index_t>(k) * wei_strides[0];
+            index_t remaining = idx;
+            const index_t wi = remaining % Wi;
+            remaining /= Wi;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t n = remaining % N;
+            const index_t g = remaining / N;
 
-            // Loop over filter dimensions
-            for(index_t z = 0; z < dims.Z; ++z)
+            float acc = 0.0f;
+            const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
+            const WeiDataType* wei_g = p_wei + g * wei_stride_g;
+
+            for(index_t x = 0; x < X; ++x)
             {
-                // Calculate output position from input position (inverse of forward)
-                index_t d_tmp = di + dims.pad_z - z * dims.dilation_z;
-                if(d_tmp % dims.stride_z != 0)
-                    continue;
-                index_t d_o = d_tmp / dims.stride_z;
-                if(d_o < 0 || d_o >= dims.Do)
-                    continue;
-
-                const TOut* out_n_do = out_n + d_o * out_strides[1];
-                const TWei* wei_k_z = wei_k + z * wei_strides[1];
-
-                for(index_t y = 0; y < dims.Y; ++y)
+                long_index_t w_tmp = wi + pad_x - x * dilation_x;
+                if(w_tmp % stride_x == 0)
                 {
-                    index_t h_tmp = hi + dims.pad_y - y * dims.dilation_y;
-                    if(h_tmp % dims.stride_y != 0)
-                        continue;
-                    index_t ho = h_tmp / dims.stride_y;
-                    if(ho < 0 || ho >= dims.Ho)
-                        continue;
-
-                    const TOut* out_n_do_ho = out_n_do + ho * out_strides[2];
-                    const TWei* wei_k_z_y = wei_k_z + y * wei_strides[2];
-
-                    for(index_t x = 0; x < dims.X; ++x)
+                    long_index_t wo = w_tmp / stride_x;
+                    if(wo >= 0 && wo < Wo)
                     {
-                        index_t w_tmp = wi + dims.pad_x - x * dims.dilation_x;
-                        if(w_tmp % dims.stride_x != 0)
-                            continue;
-                        index_t wo = w_tmp / dims.stride_x;
-                        if(wo < 0 || wo >= dims.Wo)
-                            continue;
+                        const OutDataType* out_gnk = out_gn;
+                        const WeiDataType* wei_gkc = wei_g + c * wei_stride_c;
 
-                        const TOut* out_n_do_ho_wo = out_n_do_ho + wo * out_strides[3];
-                        const TWei* wei_k_z_y_x = wei_k_z_y + x * wei_strides[3];
-
-                        // Load values from memory
-                        TOut out_loaded = out_n_do_ho_wo[k];
-                        TWei wei_loaded = wei_k_z_y_x[c];
-
-                        // Apply element-wise operations (like forward does)
-                        out_op(out_val, out_loaded);
-                        wei_op(wei_val, wei_loaded);
-
-                        // Convert to float for multiplication
-                        float out_f = type_convert<float>(out_val);
-                        float wei_f = type_convert<float>(wei_val);
-
-                        acc_float += out_f * wei_f;
+                        for(index_t k = 0; k < K; ++k)
+                        {
+                            out_op(out_val, out_gnk[k * out_stride_k + wo]);
+                            wei_op(wei_val, wei_gkc[k * wei_stride_k + x]);
+                            acc += type_convert<float>(out_val) * type_convert<float>(wei_val);
+                        }
                     }
                 }
             }
+
+            InDataType result = type_convert<InDataType>(acc);
+            in_op(in_val, result);
+            p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + wi] = in_val;
         }
+    }
+    else if constexpr(NDimSpatial == 2)
+    {
+        const long_index_t num_in = G * N * C * Hi * Wi;
+        const long_index_t out_stride_g = N * K * Ho * Wo;
+        const long_index_t out_stride_n = K * Ho * Wo;
+        const long_index_t out_stride_k = Ho * Wo;
+        const long_index_t out_stride_h = Wo;
+        const long_index_t wei_stride_g = K * C * Y * X;
+        const long_index_t wei_stride_k = C * Y * X;
+        const long_index_t wei_stride_c = Y * X;
+        const long_index_t wei_stride_y = X;
+        const long_index_t in_stride_g = N * C * Hi * Wi;
+        const long_index_t in_stride_n = C * Hi * Wi;
+        const long_index_t in_stride_c = Hi * Wi;
+        const long_index_t in_stride_h = Wi;
 
-        // Convert float accumulator to TAcc, then to input type
-        TAcc acc = type_convert<TAcc>(acc_float);
-        TIn result = type_convert<TIn>(acc);
+        for(long_index_t idx = tid; idx < num_in; idx += num_threads)
+        {
+            index_t remaining = idx;
+            const index_t wi = remaining % Wi;
+            remaining /= Wi;
+            const index_t hi = remaining % Hi;
+            remaining /= Hi;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t n = remaining % N;
+            const index_t g = remaining / N;
 
-        // Apply input element-wise operation (if any)
-        in_op(in_val, result);
+            float acc = 0.0f;
+            const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
+            const WeiDataType* wei_g = p_wei + g * wei_stride_g;
 
-        // Write transformed result
-        p_in_grad[ii] = in_val;
+            for(index_t y = 0; y < Y; ++y)
+            {
+                long_index_t h_tmp = hi + pad_y - y * dilation_y;
+                if(h_tmp % stride_y == 0)
+                {
+                    long_index_t ho = h_tmp / stride_y;
+                    if(ho >= 0 && ho < Ho)
+                    {
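+                        // Only output rows on the stride grid contribute to this input
+                        // element; accumulate over every k and every valid wo below.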
+                        const OutDataType* out_gnkh = out_gn + ho * out_stride_h;
+                        const WeiDataType* wei_gkcy = wei_g + c * wei_stride_c + y * wei_stride_y;
+
+                        for(index_t x = 0; x < X; ++x)
+                        {
+                            long_index_t w_tmp = wi + pad_x - x * dilation_x;
+                            if(w_tmp % stride_x == 0)
+                            {
+                                long_index_t wo = w_tmp / stride_x;
+                                if(wo >= 0 && wo < Wo)
+                                {
+                                    for(index_t k = 0; k < K; ++k)
+                                    {
+                                        out_op(out_val, out_gnkh[k * out_stride_k + wo]);
+                                        wei_op(wei_val, wei_gkcy[k * wei_stride_k + x]);
+                                        acc += type_convert<float>(out_val) *
+                                               type_convert<float>(wei_val);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            InDataType result = type_convert<InDataType>(acc);
+            in_op(in_val, result);
+            p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + hi * in_stride_h + wi] =
+                in_val;
+        }
+    }
+    else if constexpr(NDimSpatial == 3)
+    {
+        const long_index_t num_in = G * N * C * Di * Hi * Wi;
+        const long_index_t out_stride_g = N * K * Do * Ho * Wo;
+        const long_index_t out_stride_n = K * Do * Ho * Wo;
+        const long_index_t out_stride_k = Do * Ho * Wo;
+        const long_index_t out_stride_d = Ho * Wo;
+        const long_index_t out_stride_h = Wo;
+        const long_index_t wei_stride_g = K * C * Z * Y * X;
+        const long_index_t wei_stride_k = C * Z * Y * X;
+        const long_index_t wei_stride_c = Z * Y * X;
+        const long_index_t wei_stride_z = Y * X;
+        const long_index_t wei_stride_y = X;
+        const long_index_t in_stride_g = N * C * Di * Hi * Wi;
+        const long_index_t in_stride_n = C * Di * Hi * Wi;
+        const long_index_t in_stride_c = Di * Hi * Wi;
+        const long_index_t in_stride_d = Hi * Wi;
+        const long_index_t in_stride_h = Wi;
+
+        for(long_index_t idx = tid; idx < num_in; idx += num_threads)
+        {
+            index_t remaining = idx;
+            const index_t wi = remaining % Wi;
+            remaining /= Wi;
+            const index_t hi = remaining % Hi;
+            remaining /= Hi;
+            const index_t di = remaining % Di;
+            remaining /= Di;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t n = remaining % N;
+            const index_t g = remaining / N;
+
+            float acc = 0.0f;
+            const OutDataType* out_gn = p_out + g * out_stride_g + n * out_stride_n;
+            const WeiDataType* wei_g = p_wei + g * wei_stride_g;
+
+            for(index_t z = 0; z < Z; ++z)
+            {
+                long_index_t d_tmp = di + pad_z - z * dilation_z;
+                if(d_tmp % stride_z == 0)
+                {
+                    long_index_t do_idx = d_tmp / stride_z;
+                    if(do_idx >= 0 && do_idx < Do)
+                    {
+                        const OutDataType* out_gnkd = out_gn + do_idx * out_stride_d;
+                        const WeiDataType* wei_gkcz = wei_g + c * wei_stride_c + z * wei_stride_z;
+
+                        for(index_t y = 0; y < Y; ++y)
+                        {
+                            long_index_t h_tmp = hi + pad_y - y * dilation_y;
+                            if(h_tmp % stride_y == 0)
+                            {
+                                long_index_t ho = h_tmp / stride_y;
+                                if(ho >= 0 && ho < Ho)
+                                {
+                                    const OutDataType* out_gnkdh = out_gnkd + ho * out_stride_h;
+                                    const WeiDataType* wei_gkczy = wei_gkcz + y * wei_stride_y;
+
+                                    for(index_t x = 0; x < X; ++x)
+                                    {
+                                        long_index_t w_tmp = wi + pad_x - x * dilation_x;
+                                        if(w_tmp % stride_x == 0)
+                                        {
+                                            long_index_t wo = w_tmp / stride_x;
+                                            if(wo >= 0 && wo < Wo)
+                                            {
+                                                for(index_t k = 0; k < K; ++k)
+                                                {
+                                                    out_op(out_val,
+                                                           out_gnkdh[k * out_stride_k + wo]);
+                                                    wei_op(wei_val,
+                                                           wei_gkczy[k * wei_stride_k + x]);
+                                                    acc += type_convert<float>(out_val) *
+                                                           type_convert<float>(wei_val);
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            InDataType result = type_convert<InDataType>(acc);
+            in_op(in_val, result);
+            p_in[g * in_stride_g + n * in_stride_n + c * in_stride_c + di * in_stride_d +
+                 hi * in_stride_h + wi] = in_val;
+        }
+    }
+}
+
+// GPU reference backward data convolution - takes ConvParam directly
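+// The wrapper follows a pack -> compute -> unpack scheme: gather the caller's (possibly
+// strided) tensors into contiguous buffers, run the packed kernel above, then scatter the
+// computed input gradient back out through the caller's strides.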
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InElementwiseOperation  = tensor_operation::element_wise::PassThrough,
+          typename WeiElementwiseOperation = tensor_operation::element_wise::PassThrough,
+          typename OutElementwiseOperation = tensor_operation::element_wise::PassThrough>
+void naive_conv_bwd_data(TIn* p_in,
+                         const TWei* p_wei,
+                         const TOut* p_out,
+                         const ck::utils::conv::ConvParam& conv_param,
+                         InElementwiseOperation in_element_op   = InElementwiseOperation{},
+                         WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{},
+                         OutElementwiseOperation out_element_op = OutElementwiseOperation{},
+                         hipStream_t stream = nullptr)
+{
+    const auto ndim = conv_param.num_dim_spatial_;
+
+    const index_t G = conv_param.G_;
+    const index_t N = conv_param.N_;
+    const index_t C = conv_param.C_;
+    const index_t K = conv_param.K_;
+
+    std::vector<index_t> in_lengths  = {G, N, C};
+    std::vector<index_t> wei_lengths = {G, K, C};
+    std::vector<index_t> out_lengths = {G, N, K};
+
+    for(index_t i = 0; i < ndim; ++i)
+    {
+        in_lengths.push_back(static_cast<index_t>(conv_param.input_spatial_lengths_[i]));
+        wei_lengths.push_back(static_cast<index_t>(conv_param.filter_spatial_lengths_[i]));
+        out_lengths.push_back(static_cast<index_t>(conv_param.output_spatial_lengths_[i]));
+    }
+
+    // Calculate total elements for buffer allocation
+    long_index_t in_total = 1, wei_total = 1, out_total = 1;
+    for(auto l : in_lengths)
+        in_total *= l;
+    for(auto l : wei_lengths)
+        wei_total *= l;
+    for(auto l : out_lengths)
+        out_total *= l;
+
+    // Allocate packed buffers
+    SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn));
+    SimpleDeviceMem wei_packed_buf(wei_total * sizeof(TWei));
+    SimpleDeviceMem out_packed_buf(out_total * sizeof(TOut));
+
+    TIn* p_in_packed   = static_cast<TIn*>(in_packed_buf.GetDeviceBuffer());
+    TWei* p_wei_packed = static_cast<TWei*>(wei_packed_buf.GetDeviceBuffer());
+    TOut* p_out_packed = static_cast<TOut*>(out_packed_buf.GetDeviceBuffer());
+
+    // Compute strides and allocate device arrays for pack/unpack
+    std::vector<index_t> in_strides  = compute_conv_tensor_strides<InLayout>(in_lengths, ndim);
+    std::vector<index_t> wei_strides = compute_conv_tensor_strides<WeiLayout>(wei_lengths, ndim);
+    std::vector<index_t> out_strides = compute_conv_tensor_strides<OutLayout>(out_lengths, ndim);
+
+    const size_t dim_count = in_lengths.size();
+    SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t));
+
+    index_t* d_in_lengths  = static_cast<index_t*>(in_lengths_buf.GetDeviceBuffer());
+    index_t* d_in_strides  = static_cast<index_t*>(in_strides_buf.GetDeviceBuffer());
+    index_t* d_wei_lengths = static_cast<index_t*>(wei_lengths_buf.GetDeviceBuffer());
+    index_t* d_wei_strides = static_cast<index_t*>(wei_strides_buf.GetDeviceBuffer());
+    index_t* d_out_lengths = static_cast<index_t*>(out_lengths_buf.GetDeviceBuffer());
+    index_t* d_out_strides = static_cast<index_t*>(out_strides_buf.GetDeviceBuffer());
+
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+    HIP_CHECK_ERROR(hipMemcpy(
+        d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice));
+
+    // Pack output and weight tensors to contiguous layout (inputs to bwd data)
+    constexpr int block_size = 256;
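+    // Packing lets the compute kernel assume contiguous row-major tensors regardless of
+    // the strides implied by the caller's layout.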
+    strided_copy_kernel
+        <<<(out_total + block_size - 1) / block_size, block_size, 0, stream>>>(
+            p_out, p_out_packed, d_out_lengths, d_out_strides, dim_count, out_total);
+    strided_copy_kernel
+        <<<(wei_total + block_size - 1) / block_size, block_size, 0, stream>>>(
+            p_wei, p_wei_packed, d_wei_lengths, d_wei_strides, dim_count, wei_total);
+
+    // Build conv parameter vectors for kernel invocation
+    std::vector<index_t> conv_strides(ndim);
+    std::vector<index_t> conv_dilations(ndim);
+    std::vector<index_t> input_pads(ndim);
+    for(index_t i = 0; i < ndim; ++i)
+    {
+        conv_strides[i]   = static_cast<index_t>(conv_param.conv_filter_strides_[i]);
+        conv_dilations[i] = static_cast<index_t>(conv_param.conv_filter_dilations_[i]);
+        input_pads[i]     = static_cast<index_t>(conv_param.input_left_pads_[i]);
+    }
+
+    // Run backward data convolution kernel on packed data
+    const int in_grid = (in_total + block_size - 1) / block_size;
+
+    if(ndim == 1)
+    {
+        naive_conv_bwd_data_packed<1,
+                                   TIn,
+                                   TWei,
+                                   TOut,
+                                   InElementwiseOperation,
+                                   WeiElementwiseOperation,
+                                   OutElementwiseOperation>
+            <<<in_grid, block_size, 0, stream>>>(p_in_packed,
+                                                 p_wei_packed,
+                                                 p_out_packed,
+                                                 G, N, K, C,
+                                                 1, 1, in_lengths[3],
+                                                 1, 1, wei_lengths[3],
+                                                 1, 1, out_lengths[3],
+                                                 1, 1, conv_strides[0],
+                                                 1, 1, conv_dilations[0],
+                                                 0, 0, input_pads[0],
+                                                 in_element_op,
+                                                 wei_element_op,
+                                                 out_element_op);
+    }
+    else if(ndim == 2)
+    {
+        naive_conv_bwd_data_packed<2,
+                                   TIn,
+                                   TWei,
+                                   TOut,
+                                   InElementwiseOperation,
+                                   WeiElementwiseOperation,
+                                   OutElementwiseOperation>
+            <<<in_grid, block_size, 0, stream>>>(p_in_packed,
+                                                 p_wei_packed,
+                                                 p_out_packed,
+                                                 G, N, K, C,
+                                                 1, in_lengths[3], in_lengths[4],
+                                                 1, wei_lengths[3], wei_lengths[4],
+                                                 1, out_lengths[3], out_lengths[4],
+                                                 1, conv_strides[0], conv_strides[1],
+                                                 1, conv_dilations[0], conv_dilations[1],
+                                                 0, input_pads[0], input_pads[1],
+                                                 in_element_op,
+                                                 wei_element_op,
+                                                 out_element_op);
+    }
+    else // 3D
+    {
+        naive_conv_bwd_data_packed<3,
+                                   TIn,
+                                   TWei,
+                                   TOut,
+                                   InElementwiseOperation,
+                                   WeiElementwiseOperation,
+                                   OutElementwiseOperation>
+            <<<in_grid, block_size, 0, stream>>>(p_in_packed,
+                                                 p_wei_packed,
+                                                 p_out_packed,
+                                                 G, N, K, C,
+                                                 in_lengths[3], in_lengths[4], in_lengths[5],
+                                                 wei_lengths[3], wei_lengths[4], wei_lengths[5],
+                                                 out_lengths[3], out_lengths[4], out_lengths[5],
+                                                 conv_strides[0], conv_strides[1], conv_strides[2],
+                                                 conv_dilations[0], conv_dilations[1], conv_dilations[2],
+                                                 input_pads[0], input_pads[1], input_pads[2],
+                                                 in_element_op,
+                                                 wei_element_op,
+                                                 out_element_op);
+    }
+
+    // Unpack result back to strided layout
+    strided_copy_kernel<<<in_grid, block_size, 0, stream>>>(
+        p_in_packed, p_in, d_in_lengths, d_in_strides, dim_count, in_total);
+
+    HIP_CHECK_ERROR(hipGetLastError());
+
+    // Memory automatically freed by SimpleDeviceMem destructors
+}
+
 } // namespace ref
 } // namespace ck
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp
index ff44c9fcbd..f46b072baa 100644
--- a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp
@@ -4,136 +4,497 @@
 #pragma once
 
 #include "ck/utility/type_convert.hpp"
-#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
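+// Reuses the packing helpers from naive_conv_utils.hpp (SimpleDeviceMem,
+// strided_copy_kernel, compute_conv_tensor_strides) that the bwd-data reference uses.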
"ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { namespace ref { -/* - * \brief naive implementation of 3D convolution backward weight. - * Layout is (NDHWC, KZYXC, NDHWK). - * Computes gradient with respect to weights. - * - * \param N number of batches - * \param K number of filters (output channels) - * \param C number of input channels - * \param (Di, Hi, Wi) depth, height and width dimension of input - * \param (Z, Y, X) depth, height and width dimensions of filter - * \param (Do, Ho, Wo) depth, height and width dimension of output - * \param (stride_z, stride_y, stride_x) strides - * \param (dilation_z, dilation_y, dilation_x) dilations - * \param (pad_z, pad_y, pad_x) pads - */ -template -__global__ void naive_conv_bwd_weight_ndhwc_kzyxc_ndhwk(const TIn* __restrict__ p_in, - TWei* __restrict__ p_wei_grad, - const TOut* __restrict__ p_out_grad, - const ConvDims dims) +// Optimized backward weight convolution kernel working with packed (contiguous) tensors +// Assumes row-major packing: input[G][N][C][spatial], output_grad[G][N][K][spatial], +// weight_grad[G][K][C][filter] +// Computes gradient with respect to weights +template +__global__ void naive_conv_bwd_weight_packed(const InDataType* __restrict__ p_in, + WeiDataType* __restrict__ p_wei_grad, + const OutDataType* __restrict__ p_out_grad, + index_t G, + index_t N, + index_t K, + index_t C, + index_t Di, + index_t Hi, + index_t Wi, + index_t Z, + index_t Y, + index_t X, + index_t Do, + index_t Ho, + index_t Wo, + index_t stride_z, + index_t stride_y, + index_t stride_x, + index_t dilation_z, + index_t dilation_y, + index_t dilation_x, + index_t pad_z, + index_t pad_y, + index_t pad_x, + InElementOp in_op, + WeiElementOp wei_op, + OutElementOp out_op) { - const index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - const index_t num_threads = blockDim.x * gridDim.x; - const long_index_t weight_length = dims.K * dims.Z * dims.Y * dims.X * dims.C; + const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const long_index_t num_threads = blockDim.x * gridDim.x; - const index_t in_strides[] = { - dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C}; - const index_t out_strides[] = { - dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K}; - const index_t wei_strides[] = { - dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C}; + InDataType in_val = InDataType{0}; + WeiDataType wei_val = WeiDataType{0}; + OutDataType out_val = OutDataType{0}; - constexpr auto in_op = InElementwiseOperation{}; - constexpr auto wei_op = WeiElementwiseOperation{}; - constexpr auto out_op = OutElementwiseOperation{}; - - TIn in_val = TIn{0}; - TWei wei_val = TWei{0}; - TOut out_val = TOut{0}; - - for(long_index_t ii = tid; ii < weight_length; ii += num_threads) + if constexpr(NDimSpatial == 1) { - // Decode linear index to (k, z, y, x, c) - const index_t k = ii / wei_strides[0]; - index_t tmp = ii - k * wei_strides[0]; - const index_t z = tmp / wei_strides[1]; - tmp -= z * wei_strides[1]; - const index_t y = tmp / wei_strides[2]; - tmp -= y * wei_strides[2]; - const index_t x = tmp / wei_strides[3]; - tmp -= x * wei_strides[3]; - const index_t c = tmp; + const long_index_t num_wei = G * K * C * X; + const long_index_t in_stride_g = N * C * Wi; + const long_index_t in_stride_n = 
 
-        // Always accumulate in float
-        float acc_float = 0.0f;
-
-        // Loop over batch
-        for(index_t n = 0; n < dims.N; ++n)
+        for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
         {
-            const TIn* in_n = p_in + static_cast<long_index_t>(n) * in_strides[0];
-            const TOut* out_n = p_out_grad + static_cast<long_index_t>(n) * out_strides[0];
+            index_t remaining = idx;
+            const index_t x = remaining % X;
+            remaining /= X;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t k = remaining % K;
+            const index_t g = remaining / K;
 
-            // Loop over output spatial dimensions
-            for(index_t d_o = 0; d_o < dims.Do; ++d_o)
+            float acc = 0.0f;
+            const InDataType* in_g = p_in + g * in_stride_g;
+            const OutDataType* out_grad = p_out_grad + g * out_stride_g;
+
+            // Loop over batch and output positions
+            for(index_t n = 0; n < N; ++n)
             {
-                // Calculate input position from output position
-                index_t di = d_o * dims.stride_z - dims.pad_z + z * dims.dilation_z;
-                if(di < 0 || di >= dims.Di)
-                    continue;
+                const InDataType* in_gn = in_g + n * in_stride_n + c * in_stride_c;
+                const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
 
-                const TIn* in_n_di = in_n + di * in_strides[1];
-                const TOut* out_n_do = out_n + d_o * out_strides[1];
-
-                for(index_t ho = 0; ho < dims.Ho; ++ho)
+                for(index_t wo = 0; wo < Wo; ++wo)
                 {
-                    index_t hi = ho * dims.stride_y - dims.pad_y + y * dims.dilation_y;
-                    if(hi < 0 || hi >= dims.Hi)
-                        continue;
-
-                    const TIn* in_n_di_hi = in_n_di + hi * in_strides[2];
-                    const TOut* out_n_do_ho = out_n_do + ho * out_strides[2];
-
-                    for(index_t wo = 0; wo < dims.Wo; ++wo)
+                    long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
+                    if(wi >= 0 && wi < Wi)
                     {
-                        index_t wi = wo * dims.stride_x - dims.pad_x + x * dims.dilation_x;
-                        if(wi < 0 || wi >= dims.Wi)
-                            continue;
-
-                        // Load values from memory (like forward does)
-                        const TIn* in_ptr = in_n_di_hi + wi * in_strides[3];
-                        const TOut* out_ptr = out_n_do_ho + wo * out_strides[3];
-
-                        TIn in_loaded = in_ptr[c];
-                        TOut out_loaded = out_ptr[k];
-
-                        // Apply element-wise operations
-                        in_op(in_val, in_loaded);
-                        out_op(out_val, out_loaded);
-
-                        // Convert to float for multiplication
-                        float in_f = type_convert<float>(in_val);
-                        float out_f = type_convert<float>(out_val);
-
-                        acc_float += out_f * in_f;
+                        in_op(in_val, in_gn[wi]);
+                        out_op(out_val, out_gn_k[wo]);
+                        acc += type_convert<float>(out_val) * type_convert<float>(in_val);
                     }
                 }
            }
+
+            WeiDataType result = type_convert<WeiDataType>(acc);
+            wei_op(wei_val, result);
+            p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + x] = wei_val;
         }
+    }
+    else if constexpr(NDimSpatial == 2)
+    {
+        const long_index_t num_wei = G * K * C * Y * X;
+        const long_index_t in_stride_g = N * C * Hi * Wi;
+        const long_index_t in_stride_n = C * Hi * Wi;
+        const long_index_t in_stride_c = Hi * Wi;
+        const long_index_t in_stride_h = Wi;
+        const long_index_t out_stride_g = N * K * Ho * Wo;
+        const long_index_t out_stride_n = K * Ho * Wo;
+        const long_index_t out_stride_k = Ho * Wo;
+        const long_index_t out_stride_h = Wo;
+        const long_index_t wei_stride_g = K * C * Y * X;
+        const long_index_t wei_stride_k = C * Y * X;
+        const long_index_t wei_stride_c = Y * X;
+        const long_index_t wei_stride_y = X;
 
-        // Convert float accumulator to TAcc, then to weight type
-        TAcc acc = type_convert<TAcc>(acc_float);
-        TWei result = type_convert<TWei>(acc);
+        for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
+        {
+            index_t remaining = idx;
+            const index_t x = remaining % X;
+            remaining /= X;
+            const index_t y = remaining % Y;
+            remaining /= Y;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t k = remaining % K;
+            const index_t g = remaining / K;
 
-        // Apply weight element-wise operation (if any)
-        wei_op(wei_val, result);
+            float acc = 0.0f;
+            const InDataType* in_g = p_in + g * in_stride_g;
+            const OutDataType* out_grad = p_out_grad + g * out_stride_g;
 
-        // Write transformed result
-        p_wei_grad[ii] = wei_val;
+            // Loop over batch and output positions
+            for(index_t n = 0; n < N; ++n)
+            {
+                const InDataType* in_gnc = in_g + n * in_stride_n + c * in_stride_c;
+                const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
+
+                for(index_t ho = 0; ho < Ho; ++ho)
+                {
+                    long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
+                    if(hi >= 0 && hi < Hi)
+                    {
+                        const InDataType* in_gnch = in_gnc + hi * in_stride_h;
+                        const OutDataType* out_gn_kh = out_gn_k + ho * out_stride_h;
+
+                        for(index_t wo = 0; wo < Wo; ++wo)
+                        {
+                            long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
+                            if(wi >= 0 && wi < Wi)
+                            {
+                                in_op(in_val, in_gnch[wi]);
+                                out_op(out_val, out_gn_kh[wo]);
+                                acc += type_convert<float>(out_val) * type_convert<float>(in_val);
+                            }
+                        }
+                    }
+                }
+            }
+
+            WeiDataType result = type_convert<WeiDataType>(acc);
+            wei_op(wei_val, result);
+            p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + y * wei_stride_y +
+                       x] = wei_val;
+        }
     }
+    else if constexpr(NDimSpatial == 3)
+    {
+        const long_index_t num_wei = G * K * C * Z * Y * X;
+        const long_index_t in_stride_g = N * C * Di * Hi * Wi;
+        const long_index_t in_stride_n = C * Di * Hi * Wi;
+        const long_index_t in_stride_c = Di * Hi * Wi;
+        const long_index_t in_stride_d = Hi * Wi;
+        const long_index_t in_stride_h = Wi;
+        const long_index_t out_stride_g = N * K * Do * Ho * Wo;
+        const long_index_t out_stride_n = K * Do * Ho * Wo;
+        const long_index_t out_stride_k = Do * Ho * Wo;
+        const long_index_t out_stride_d = Ho * Wo;
+        const long_index_t out_stride_h = Wo;
+        const long_index_t wei_stride_g = K * C * Z * Y * X;
+        const long_index_t wei_stride_k = C * Z * Y * X;
+        const long_index_t wei_stride_c = Z * Y * X;
+        const long_index_t wei_stride_z = Y * X;
+        const long_index_t wei_stride_y = X;
+
+        for(long_index_t idx = tid; idx < num_wei; idx += num_threads)
+        {
+            index_t remaining = idx;
+            const index_t x = remaining % X;
+            remaining /= X;
+            const index_t y = remaining % Y;
+            remaining /= Y;
+            const index_t z = remaining % Z;
+            remaining /= Z;
+            const index_t c = remaining % C;
+            remaining /= C;
+            const index_t k = remaining % K;
+            const index_t g = remaining / K;
+
+            float acc = 0.0f;
+            const InDataType* in_g = p_in + g * in_stride_g;
+            const OutDataType* out_grad = p_out_grad + g * out_stride_g;
+
+            // Loop over batch and output positions
+            for(index_t n = 0; n < N; ++n)
+            {
+                const InDataType* in_gnc = in_g + n * in_stride_n + c * in_stride_c;
+                const OutDataType* out_gn_k = out_grad + n * out_stride_n + k * out_stride_k;
+
+                for(index_t do_idx = 0; do_idx < Do; ++do_idx)
+                {
+                    long_index_t di = do_idx * stride_z + z * dilation_z - pad_z;
+                    if(di >= 0 && di < Di)
+                    {
+                        const InDataType* in_gncd = in_gnc + di * in_stride_d;
+                        const OutDataType* out_gn_kd = out_gn_k + do_idx * out_stride_d;
+
+                        for(index_t ho = 0; ho < Ho; ++ho)
+                        {
+                            long_index_t hi = ho * stride_y + y * dilation_y - pad_y;
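+                            // Rows whose receptive field falls into padding contribute nothing.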
if(hi >= 0 && hi < Hi)
+                            {
+                                const InDataType* in_gncdh   = in_gncd + hi * in_stride_h;
+                                const OutDataType* out_gn_kdh = out_gn_kd + ho * out_stride_h;
+
+                                for(index_t wo = 0; wo < Wo; ++wo)
+                                {
+                                    long_index_t wi = wo * stride_x + x * dilation_x - pad_x;
+                                    if(wi >= 0 && wi < Wi)
+                                    {
+                                        in_op(in_val, in_gncdh[wi]);
+                                        out_op(out_val, out_gn_kdh[wo]);
+                                        acc += type_convert<float>(out_val) *
+                                               type_convert<float>(in_val);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            WeiDataType result = type_convert<WeiDataType>(acc);
+            wei_op(wei_val, result);
+            p_wei_grad[g * wei_stride_g + k * wei_stride_k + c * wei_stride_c + z * wei_stride_z +
+                       y * wei_stride_y + x] = wei_val;
+        }
+    }
+}
+
+// GPU reference backward weight convolution - takes ConvParam directly
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InElementwiseOperation  = ck::tensor_operation::element_wise::PassThrough,
+          typename WeiElementwiseOperation = ck::tensor_operation::element_wise::PassThrough,
+          typename OutElementwiseOperation = ck::tensor_operation::element_wise::PassThrough>
+void naive_conv_bwd_weight(const TIn* p_in,
+                           TWei* p_wei_grad,
+                           const TOut* p_out,
+                           const ck::utils::conv::ConvParam& conv_param,
+                           InElementwiseOperation in_element_op   = InElementwiseOperation{},
+                           WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{},
+                           OutElementwiseOperation out_element_op = OutElementwiseOperation{},
+                           hipStream_t stream = nullptr)
+{
+    const auto ndim = conv_param.num_dim_spatial_;
+
+    const index_t G = conv_param.G_;
+    const index_t N = conv_param.N_;
+    const index_t C = conv_param.C_;
+    const index_t K = conv_param.K_;
+
+    std::vector<index_t> in_lengths  = {G, N, C};
+    std::vector<index_t> wei_lengths = {G, K, C};
+    std::vector<index_t> out_lengths = {G, N, K};
+
+    for(index_t i = 0; i < ndim; ++i)
+    {
+        in_lengths.push_back(static_cast<index_t>(conv_param.input_spatial_lengths_[i]));
+        wei_lengths.push_back(static_cast<index_t>(conv_param.filter_spatial_lengths_[i]));
+        out_lengths.push_back(static_cast<index_t>(conv_param.output_spatial_lengths_[i]));
+    }
+
+    // Calculate total elements for buffer allocation
+    long_index_t in_total = 1, wei_total = 1, out_total = 1;
+    for(auto l : in_lengths)
+        in_total *= l;
+    for(auto l : wei_lengths)
+        wei_total *= l;
+    for(auto l : out_lengths)
+        out_total *= l;
+
+    // Allocate packed buffers
+    SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn));
+    SimpleDeviceMem wei_grad_packed_buf(wei_total * sizeof(TWei));
+    SimpleDeviceMem out_grad_packed_buf(out_total * sizeof(TOut));
+
+    TIn* p_in_packed        = static_cast<TIn*>(in_packed_buf.GetDeviceBuffer());
+    TWei* p_wei_grad_packed = static_cast<TWei*>(wei_grad_packed_buf.GetDeviceBuffer());
+    TOut* p_out_grad_packed = static_cast<TOut*>(out_grad_packed_buf.GetDeviceBuffer());
+
+    // Compute strides and allocate device arrays for pack/unpack
+    std::vector<index_t> in_strides  = compute_conv_tensor_strides<InLayout>(in_lengths, ndim);
+    std::vector<index_t> wei_strides = compute_conv_tensor_strides<WeiLayout>(wei_lengths, ndim);
+    std::vector<index_t> out_strides = compute_conv_tensor_strides<OutLayout>(out_lengths, ndim);
+
+    const size_t dim_count = in_lengths.size();
+    SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t));
+    SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t));
+
+    index_t* d_in_lengths  = static_cast<index_t*>(in_lengths_buf.GetDeviceBuffer());
+    index_t* d_in_strides  = static_cast<index_t*>(in_strides_buf.GetDeviceBuffer());
+    index_t* d_wei_lengths = static_cast<index_t*>(wei_lengths_buf.GetDeviceBuffer());
+    index_t* d_wei_strides = static_cast<index_t*>(wei_strides_buf.GetDeviceBuffer());
+    index_t* d_out_lengths = static_cast<index_t*>(out_lengths_buf.GetDeviceBuffer());
+    index_t* d_out_strides = static_cast<index_t*>(out_strides_buf.GetDeviceBuffer());
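+
+    // The pack/unpack kernels dereference these length/stride arrays on the GPU,
+    // so the host-side vectors must be mirrored into device memory before launch.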
HIP_CHECK_ERROR(hipMemcpy( + d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + + // Pack input and output_grad tensors to contiguous layout (inputs to bwd weight) + constexpr int block_size = 256; + strided_copy_kernel + <<<(in_total + block_size - 1) / block_size, block_size, 0, stream>>>( + p_in, p_in_packed, d_in_lengths, d_in_strides, dim_count, in_total); + strided_copy_kernel + <<<(out_total + block_size - 1) / block_size, block_size, 0, stream>>>( + p_out, p_out_grad_packed, d_out_lengths, d_out_strides, dim_count, out_total); + + // Build conv parameter vectors for kernel invocation + std::vector conv_strides(ndim); + std::vector conv_dilations(ndim); + std::vector input_pads(ndim); + for(index_t i = 0; i < ndim; ++i) + { + conv_strides[i] = static_cast(conv_param.conv_filter_strides_[i]); + conv_dilations[i] = static_cast(conv_param.conv_filter_dilations_[i]); + input_pads[i] = static_cast(conv_param.input_left_pads_[i]); + } + + // Run backward weight convolution kernel on packed data + const int wei_grid = (wei_total + block_size - 1) / block_size; + + if(ndim == 1) + { + naive_conv_bwd_weight_packed<1, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_grad_packed, + p_out_grad_packed, + G, + N, + K, + C, + 1, + 1, + in_lengths[3], + 1, + 1, + wei_lengths[3], + 1, + 1, + out_lengths[3], + 1, + 1, + conv_strides[0], + 1, + 1, + conv_dilations[0], + 0, + 0, + input_pads[0], + in_element_op, + wei_element_op, + out_element_op); + } + else if(ndim == 2) + { + naive_conv_bwd_weight_packed<2, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_grad_packed, + p_out_grad_packed, + G, + N, + K, + C, + 1, + in_lengths[3], + in_lengths[4], + 1, + wei_lengths[3], + wei_lengths[4], + 1, + out_lengths[3], + out_lengths[4], + 1, + conv_strides[0], + conv_strides[1], + 1, + conv_dilations[0], + conv_dilations[1], + 0, + input_pads[0], + input_pads[1], + in_element_op, + wei_element_op, + out_element_op); + } + else // 3D + { + naive_conv_bwd_weight_packed<3, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_grad_packed, + p_out_grad_packed, + G, + N, + K, + C, + in_lengths[3], + in_lengths[4], + in_lengths[5], + wei_lengths[3], + wei_lengths[4], + wei_lengths[5], + out_lengths[3], + out_lengths[4], + out_lengths[5], + conv_strides[0], + conv_strides[1], + conv_strides[2], + conv_dilations[0], + conv_dilations[1], + conv_dilations[2], + input_pads[0], + input_pads[1], + input_pads[2], + in_element_op, + wei_element_op, + out_element_op); + } + + // Unpack weight gradient + strided_copy_kernel<<>>( + p_wei_grad_packed, p_wei_grad, d_wei_lengths, d_wei_strides, dim_count, wei_total); + + HIP_CHECK_ERROR(hipGetLastError()); + + // Memory 
automatically freed by SimpleDeviceMem destructors +} + } // namespace ref } // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp index defbbd5be4..131b632a25 100644 --- a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp +++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp @@ -4,126 +4,493 @@ #pragma once #include "ck/utility/type_convert.hpp" -#include "ck/library/reference_tensor_operation/gpu/conv_common.hpp" +#include "ck/host_utility/hip_check_error.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { namespace ref { -/* - * \brief naive implementation of 3D convolution. Layout is (NDHWC, KZYXC, NDHWK). - * - * \param N number of batches - * \param K number of filters - * \param C number of channels of weight - * \param (Di, Hi, Wi) depth, height and width dimension of data - * \param (Z, Y, X) depth, height and width dimensions of weights - * \param (Do, Ho, Wo) depth, height and width dimension of output - * \param (stride_z, stride_y, stride_x) strides - * \param (dilation_z, dilation_y, dilation_x) dilations - * \param (pad_z, pad_y, pad_x) pads - */ -template -__global__ void naive_conv_fwd_ndhwc_kzyxc_ndhwk(const TIn* __restrict__ p_in, - const TWei* __restrict__ p_wei, - TOut* __restrict__ p_out, - const ConvDims dims) +// Optimized convolution kernel working with packed (contiguous) tensors +// Assumes row-major packing: input[G][N][C][spatial], weight[G][K][C][filter], +// output[G][N][K][spatial] +template +__global__ void naive_conv_fwd_packed(const InDataType* __restrict__ p_in, + const WeiDataType* __restrict__ p_wei, + OutDataType* __restrict__ p_out, + index_t G, + index_t N, + index_t K, + index_t C, + index_t Di, + index_t Hi, + index_t Wi, + index_t Z, + index_t Y, + index_t X, + index_t Do, + index_t Ho, + index_t Wo, + index_t stride_z, + index_t stride_y, + index_t stride_x, + index_t dilation_z, + index_t dilation_y, + index_t dilation_x, + index_t pad_z, + index_t pad_y, + index_t pad_x, + InElementOp in_op, + WeiElementOp wei_op, + OutElementOp out_op) { - const index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - const index_t num_threads = blockDim.x * gridDim.x; - const long_index_t output_length = dims.N * dims.Do * dims.Ho * dims.Wo * dims.K; + const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const long_index_t num_threads = blockDim.x * gridDim.x; - const index_t out_strides[] = { - dims.Do * dims.Ho * dims.Wo * dims.K, dims.Ho * dims.Wo * dims.K, dims.Wo * dims.K, dims.K}; - const index_t in_strides[] = { - dims.Di * dims.Hi * dims.Wi * dims.C, dims.Hi * dims.Wi * dims.C, dims.Wi * dims.C, dims.C}; - const index_t wei_strides[] = { - dims.Z * dims.Y * dims.X * dims.C, dims.Y * dims.X * dims.C, dims.X * dims.C, dims.C}; + InDataType in_val = InDataType{0}; + WeiDataType wei_val = WeiDataType{0}; + OutDataType out_val = OutDataType{0}; - constexpr auto in_op = InElementwiseOperation{}; - constexpr auto wei_op = WeiElementwiseOperation{}; - constexpr auto out_op = OutElementwiseOperation{}; - - TIn in_val = TIn{0}; - TWei wei_val = TWei{0}; - TOut out_val = 
TOut{0}; - - for(long_index_t ii = tid; ii < output_length; ii += num_threads) + if constexpr(NDimSpatial == 1) { - const index_t n = ii / out_strides[0]; - index_t k = ii - n * out_strides[0]; - const index_t dO = k / out_strides[1]; - k -= dO * out_strides[1]; - const index_t ho = k / out_strides[2]; - k -= ho * out_strides[2]; - const index_t wo = k / out_strides[3]; - k -= wo * out_strides[3]; + const long_index_t num_out = G * N * K * Wo; + const long_index_t in_stride_g = N * C * Wi; + const long_index_t in_stride_n = C * Wi; + const long_index_t in_stride_c = Wi; + const long_index_t wei_stride_g = K * C * X; + const long_index_t wei_stride_k = C * X; + const long_index_t wei_stride_c = X; + const long_index_t out_stride_g = N * K * Wo; + const long_index_t out_stride_n = K * Wo; + const long_index_t out_stride_k = Wo; - // Always accumulate in float (FP8/BF8 don't support arithmetic) - float acc_float = 0.0f; - - const TIn* in_n = p_in + static_cast(n) * in_strides[0]; - const TWei* wei_k = p_wei + static_cast(k) * wei_strides[0]; - - for(index_t z = 0; z < dims.Z; ++z) + for(long_index_t idx = tid; idx < num_out; idx += num_threads) { - index_t di = dims.stride_z * dO - dims.pad_z + dims.dilation_z * z; - const TIn* in_n_di = in_n + di * in_strides[1]; - const TWei* wei_k_z = wei_k + z * wei_strides[1]; + index_t remaining = idx; + const index_t wo = remaining % Wo; + remaining /= Wo; + const index_t k = remaining % K; + remaining /= K; + const index_t n = remaining % N; + const index_t g = remaining / N; - for(index_t y = 0; y < dims.Y; ++y) + float acc = 0.0f; + const InDataType* in_g = p_in + g * in_stride_g + n * in_stride_n; + const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k; + + for(index_t c = 0; c < C; ++c) { - index_t hi = dims.stride_y * ho - dims.pad_y + dims.dilation_y * y; - const TIn* in_n_di_hi = in_n_di + hi * in_strides[2]; - const TWei* wei_k_z_y = wei_k_z + y * wei_strides[2]; + const InDataType* in_gc = in_g + c * in_stride_c; + const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c; - for(index_t x = 0; x < dims.X; ++x) + for(index_t x = 0; x < X; ++x) { - index_t wi = dims.stride_x * wo - dims.pad_x + dims.dilation_x * x; - const TIn* in_n_di_hi_wi = in_n_di_hi + wi * in_strides[3]; - const TWei* wei_k_z_y_x = wei_k_z_y + x * wei_strides[3]; - - if(di >= 0 && di < dims.Di && hi >= 0 && hi < dims.Hi && wi >= 0 && - wi < dims.Wi) + long_index_t wi = wo * stride_x + x * dilation_x - pad_x; + if(wi >= 0 && wi < Wi) { - for(index_t c = 0; c < dims.C; ++c) + in_op(in_val, in_gc[wi]); + wei_op(wei_val, wei_gkc[x]); + acc += type_convert(in_val) * type_convert(wei_val); + } + } + } + + OutDataType result = type_convert(acc); + out_op(out_val, result); + p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + wo] = out_val; + } + } + else if constexpr(NDimSpatial == 2) + { + const long_index_t num_out = G * N * K * Ho * Wo; + const long_index_t in_stride_g = N * C * Hi * Wi; + const long_index_t in_stride_n = C * Hi * Wi; + const long_index_t in_stride_c = Hi * Wi; + const long_index_t in_stride_h = Wi; + const long_index_t wei_stride_g = K * C * Y * X; + const long_index_t wei_stride_k = C * Y * X; + const long_index_t wei_stride_c = Y * X; + const long_index_t wei_stride_y = X; + const long_index_t out_stride_g = N * K * Ho * Wo; + const long_index_t out_stride_n = K * Ho * Wo; + const long_index_t out_stride_k = Ho * Wo; + const long_index_t out_stride_h = Wo; + + for(long_index_t idx = tid; idx < num_out; idx += num_threads) + { + 
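+            // Grid-stride loop: each thread handles every num_threads-th output
+            // element. The linear index decomposes innermost-first; e.g. with
+            // Wo=7, Ho=5, K=8 (illustrative values), idx = 3 + 7*(2 + 5*(1 + 8*n))
+            // recovers wo=3, ho=2, k=1 and batch index n below.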
index_t remaining = idx; + const index_t wo = remaining % Wo; + remaining /= Wo; + const index_t ho = remaining % Ho; + remaining /= Ho; + const index_t k = remaining % K; + remaining /= K; + const index_t n = remaining % N; + const index_t g = remaining / N; + + float acc = 0.0f; + const InDataType* in_gn = p_in + g * in_stride_g + n * in_stride_n; + const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k; + + for(index_t c = 0; c < C; ++c) + { + const InDataType* in_gnc = in_gn + c * in_stride_c; + const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c; + + for(index_t y = 0; y < Y; ++y) + { + long_index_t hi = ho * stride_y + y * dilation_y - pad_y; + if(hi >= 0 && hi < Hi) + { + const InDataType* in_gnch = in_gnc + hi * in_stride_h; + const WeiDataType* wei_gkcy = wei_gkc + y * wei_stride_y; + + for(index_t x = 0; x < X; ++x) { - // Load values from memory - TIn in_loaded = in_n_di_hi_wi[c]; - TWei wei_loaded = wei_k_z_y_x[c]; - - // Apply element-wise operations - in_op(in_val, in_loaded); - wei_op(wei_val, wei_loaded); - - // Always convert to float for multiplication (FP8/BF8 don't support - // direct arithmetic) - float in_f = type_convert(in_val); - float wei_f = type_convert(wei_val); - - // Accumulate in float - acc_float += in_f * wei_f; + long_index_t wi = wo * stride_x + x * dilation_x - pad_x; + if(wi >= 0 && wi < Wi) + { + in_op(in_val, in_gnch[wi]); + wei_op(wei_val, wei_gkcy[x]); + acc += type_convert(in_val) * type_convert(wei_val); + } } } } } + + OutDataType result = type_convert(acc); + out_op(out_val, result); + p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + ho * out_stride_h + wo] = + out_val; } + } + else if constexpr(NDimSpatial == 3) + { + const long_index_t num_out = G * N * K * Do * Ho * Wo; + const long_index_t in_stride_g = N * C * Di * Hi * Wi; + const long_index_t in_stride_n = C * Di * Hi * Wi; + const long_index_t in_stride_c = Di * Hi * Wi; + const long_index_t in_stride_d = Hi * Wi; + const long_index_t in_stride_h = Wi; + const long_index_t wei_stride_g = K * C * Z * Y * X; + const long_index_t wei_stride_k = C * Z * Y * X; + const long_index_t wei_stride_c = Z * Y * X; + const long_index_t wei_stride_z = Y * X; + const long_index_t wei_stride_y = X; + const long_index_t out_stride_g = N * K * Do * Ho * Wo; + const long_index_t out_stride_n = K * Do * Ho * Wo; + const long_index_t out_stride_k = Do * Ho * Wo; + const long_index_t out_stride_d = Ho * Wo; + const long_index_t out_stride_h = Wo; - // Convert float accumulator to TAcc, then to output type - TAcc acc = type_convert(acc_float); - TOut result = type_convert(acc); + for(long_index_t idx = tid; idx < num_out; idx += num_threads) + { + index_t remaining = idx; + const index_t wo = remaining % Wo; + remaining /= Wo; + const index_t ho = remaining % Ho; + remaining /= Ho; + const index_t do_idx = remaining % Do; + remaining /= Do; + const index_t k = remaining % K; + remaining /= K; + const index_t n = remaining % N; + const index_t g = remaining / N; - // Apply output element-wise operation (if any) - out_op(out_val, result); + float acc = 0.0f; + const InDataType* in_gn = p_in + g * in_stride_g + n * in_stride_n; + const WeiDataType* wei_gk = p_wei + g * wei_stride_g + k * wei_stride_k; - // Write transformed result - p_out[ii] = out_val; + for(index_t c = 0; c < C; ++c) + { + const InDataType* in_gnc = in_gn + c * in_stride_c; + const WeiDataType* wei_gkc = wei_gk + c * wei_stride_c; + + for(index_t z = 0; z < Z; ++z) + { + long_index_t di = do_idx * stride_z + z 
* dilation_z - pad_z; + if(di >= 0 && di < Di) + { + const InDataType* in_gncd = in_gnc + di * in_stride_d; + const WeiDataType* wei_gkcz = wei_gkc + z * wei_stride_z; + + for(index_t y = 0; y < Y; ++y) + { + long_index_t hi = ho * stride_y + y * dilation_y - pad_y; + if(hi >= 0 && hi < Hi) + { + const InDataType* in_gncdh = in_gncd + hi * in_stride_h; + const WeiDataType* wei_gkczy = wei_gkcz + y * wei_stride_y; + + for(index_t x = 0; x < X; ++x) + { + long_index_t wi = wo * stride_x + x * dilation_x - pad_x; + if(wi >= 0 && wi < Wi) + { + in_op(in_val, in_gncdh[wi]); + wei_op(wei_val, wei_gkczy[x]); + acc += type_convert(in_val) * + type_convert(wei_val); + } + } + } + } + } + } + } + + OutDataType result = type_convert(acc); + out_op(out_val, result); + p_out[g * out_stride_g + n * out_stride_n + k * out_stride_k + do_idx * out_stride_d + + ho * out_stride_h + wo] = out_val; + } } } + +// GPU reference convolution - takes ConvParam directly +template +void naive_conv_fwd(const TIn* p_in, + const TWei* p_wei, + TOut* p_out, + const ck::utils::conv::ConvParam& conv_param, + InElementwiseOperation in_element_op = InElementwiseOperation{}, + WeiElementwiseOperation wei_element_op = WeiElementwiseOperation{}, + OutElementwiseOperation out_element_op = OutElementwiseOperation{}, + hipStream_t stream = nullptr) +{ + const auto ndim = conv_param.num_dim_spatial_; + + const index_t G = conv_param.G_; + const index_t N = conv_param.N_; + const index_t C = conv_param.C_; + const index_t K = conv_param.K_; + + std::vector in_lengths = {G, N, C}; + std::vector wei_lengths = {G, K, C}; + std::vector out_lengths = {G, N, K}; + + for(index_t i = 0; i < ndim; ++i) + { + in_lengths.push_back(static_cast(conv_param.input_spatial_lengths_[i])); + wei_lengths.push_back(static_cast(conv_param.filter_spatial_lengths_[i])); + out_lengths.push_back(static_cast(conv_param.output_spatial_lengths_[i])); + } + + // Calculate total elements for buffer allocation + long_index_t in_total = 1, wei_total = 1, out_total = 1; + for(auto l : in_lengths) + in_total *= l; + for(auto l : wei_lengths) + wei_total *= l; + for(auto l : out_lengths) + out_total *= l; + + // Allocate packed buffers + SimpleDeviceMem in_packed_buf(in_total * sizeof(TIn)); + SimpleDeviceMem wei_packed_buf(wei_total * sizeof(TWei)); + SimpleDeviceMem out_packed_buf(out_total * sizeof(TOut)); + + TIn* p_in_packed = static_cast(in_packed_buf.GetDeviceBuffer()); + TWei* p_wei_packed = static_cast(wei_packed_buf.GetDeviceBuffer()); + TOut* p_out_packed = static_cast(out_packed_buf.GetDeviceBuffer()); + + // Compute strides and allocate device arrays for pack/unpack + std::vector in_strides = compute_conv_tensor_strides(in_lengths, ndim); + std::vector wei_strides = compute_conv_tensor_strides(wei_lengths, ndim); + std::vector out_strides = compute_conv_tensor_strides(out_lengths, ndim); + + const size_t dim_count = in_lengths.size(); + SimpleDeviceMem in_lengths_buf(dim_count * sizeof(index_t)); + SimpleDeviceMem in_strides_buf(dim_count * sizeof(index_t)); + SimpleDeviceMem wei_lengths_buf(dim_count * sizeof(index_t)); + SimpleDeviceMem wei_strides_buf(dim_count * sizeof(index_t)); + SimpleDeviceMem out_lengths_buf(dim_count * sizeof(index_t)); + SimpleDeviceMem out_strides_buf(dim_count * sizeof(index_t)); + + index_t* d_in_lengths = static_cast(in_lengths_buf.GetDeviceBuffer()); + index_t* d_in_strides = static_cast(in_strides_buf.GetDeviceBuffer()); + index_t* d_wei_lengths = static_cast(wei_lengths_buf.GetDeviceBuffer()); + index_t* d_wei_strides = 
static_cast(wei_strides_buf.GetDeviceBuffer()); + index_t* d_out_lengths = static_cast(out_lengths_buf.GetDeviceBuffer()); + index_t* d_out_strides = static_cast(out_strides_buf.GetDeviceBuffer()); + + HIP_CHECK_ERROR(hipMemcpy( + d_in_lengths, in_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_in_strides, in_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_wei_lengths, wei_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_wei_strides, wei_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_out_lengths, out_lengths.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy( + d_out_strides, out_strides.data(), dim_count * sizeof(index_t), hipMemcpyHostToDevice)); + + // Pack input and weight tensors to contiguous layout + constexpr int block_size = 256; + strided_copy_kernel + <<<(in_total + block_size - 1) / block_size, block_size, 0, stream>>>( + p_in, p_in_packed, d_in_lengths, d_in_strides, dim_count, in_total); + strided_copy_kernel + <<<(wei_total + block_size - 1) / block_size, block_size, 0, stream>>>( + p_wei, p_wei_packed, d_wei_lengths, d_wei_strides, dim_count, wei_total); + + // Build conv parameter vectors for kernel invocation + std::vector conv_strides(ndim); + std::vector conv_dilations(ndim); + std::vector input_pads(ndim); + for(index_t i = 0; i < ndim; ++i) + { + conv_strides[i] = static_cast(conv_param.conv_filter_strides_[i]); + conv_dilations[i] = static_cast(conv_param.conv_filter_dilations_[i]); + input_pads[i] = static_cast(conv_param.input_left_pads_[i]); + } + + // Run convolution kernel on packed data + const int out_grid = (out_total + block_size - 1) / block_size; + + if(ndim == 1) + { + naive_conv_fwd_packed<1, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_packed, + p_out_packed, + G, + N, + K, + C, + 1, + 1, + in_lengths[3], + 1, + 1, + wei_lengths[3], + 1, + 1, + out_lengths[3], + 1, + 1, + conv_strides[0], + 1, + 1, + conv_dilations[0], + 0, + 0, + input_pads[0], + in_element_op, + wei_element_op, + out_element_op); + } + else if(ndim == 2) + { + naive_conv_fwd_packed<2, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_packed, + p_out_packed, + G, + N, + K, + C, + 1, + in_lengths[3], + in_lengths[4], + 1, + wei_lengths[3], + wei_lengths[4], + 1, + out_lengths[3], + out_lengths[4], + 1, + conv_strides[0], + conv_strides[1], + 1, + conv_dilations[0], + conv_dilations[1], + 0, + input_pads[0], + input_pads[1], + in_element_op, + wei_element_op, + out_element_op); + } + else // 3D + { + naive_conv_fwd_packed<3, + TIn, + TWei, + TOut, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> + <<>>(p_in_packed, + p_wei_packed, + p_out_packed, + G, + N, + K, + C, + in_lengths[3], + in_lengths[4], + in_lengths[5], + wei_lengths[3], + wei_lengths[4], + wei_lengths[5], + out_lengths[3], + out_lengths[4], + out_lengths[5], + conv_strides[0], + conv_strides[1], + conv_strides[2], + conv_dilations[0], + conv_dilations[1], + conv_dilations[2], + input_pads[0], + input_pads[1], + input_pads[2], + in_element_op, + wei_element_op, + out_element_op); + } + + // Unpack + strided_copy_kernel<<>>( + p_out_packed, p_out, d_out_lengths, d_out_strides, 
dim_count, out_total);
+
+    HIP_CHECK_ERROR(hipGetLastError());
+
+    // Memory automatically freed by SimpleDeviceMem destructors
+}
+
 } // namespace ref
 } // namespace ck
diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp
new file mode 100644
index 0000000000..0a7b58b310
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp
@@ -0,0 +1,177 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include <algorithm>
+#include <vector>
+
+namespace ck {
+namespace ref {
+
+// RAII wrapper for device memory to prevent leaks
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&p_mem_), mem_size));
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Helper function to map layout dimension character to index in lengths array
+// lengths array structure: [G, N/K, C/K, spatial...]
+inline int map_dim_char_to_index(char dim_char, index_t ndim_spatial, bool is_weight)
+{
+    // G dimension
+    if(dim_char == 'G')
+        return 0;
+
+    // Batch/output channels dimension (N for input/output, K for weight's first non-G dim)
+    if(dim_char == 'N')
+        return 1;
+    if(dim_char == 'K' && is_weight)
+        return 1;
+
+    // Channel dimension (C for input/weight, K for output)
+    if(dim_char == 'C')
+        return 2;
+    if(dim_char == 'K' && !is_weight)
+        return 2;
+
+    // Spatial dimensions - map based on ndim_spatial
+    // Input/Output use: D/H/W, Weight uses: Z/Y/X
+    if(ndim_spatial == 1)
+    {
+        if(dim_char == 'W' || dim_char == 'X')
+            return 3;
+    }
+    else if(ndim_spatial == 2)
+    {
+        if(dim_char == 'H' || dim_char == 'Y')
+            return 3;
+        if(dim_char == 'W' || dim_char == 'X')
+            return 4;
+    }
+    else if(ndim_spatial == 3)
+    {
+        if(dim_char == 'D' || dim_char == 'Z')
+            return 3;
+        if(dim_char == 'H' || dim_char == 'Y')
+            return 4;
+        if(dim_char == 'W' || dim_char == 'X')
+            return 5;
+    }
+
+    // Should not reach here
+    return -1;
+}
+
+// Template function to compute layout-aware strides based on layout name
+// The layout name directly encodes memory ordering from left to right
+template <typename Layout>
+inline std::vector<index_t> compute_conv_tensor_strides(const std::vector<index_t>& lengths,
+                                                        index_t ndim_spatial)
+{
+    constexpr const char* layout_name = Layout::name;
+    const int num_dims                = static_cast<int>(lengths.size());
+    std::vector<index_t> strides(num_dims, 0);
+
+    // Determine if this is a weight tensor (has 'K' but not 'N')
+    bool has_k = false;
+    bool has_n = false;
+    for(const char* p = layout_name; *p != '\0'; ++p)
+    {
+        if(*p == 'K')
+            has_k = true;
+        if(*p == 'N')
+            has_n = true;
+    }
+    bool is_weight = has_k && !has_n;
+
+    // Build dimension ordering from layout name (parse string)
+    std::vector<char> dim_order;
+    const char dim_chars[] = {'G', 'N', 'K', 'C', 'D', 'H', 'W', 'X', 'Y', 'Z'};
+    for(const char* p = layout_name; *p != '\0'; ++p)
+    {
+        char c = *p;
+        // Skip underscores (strided layouts)
+        if(c == '_')
+            continue;
+        // Valid dimension characters
+        if(std::find(std::begin(dim_chars), std::end(dim_chars), c) != std::end(dim_chars))
+        {
+            dim_order.push_back(c);
+        }
+    }
+
+    // Compute strides: process from right to left (innermost to outermost)
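+    // Example (illustrative): for layout NHWGC with lengths [G=2, N=4, C=8, H=7, W=7],
+    // walking the name right-to-left assigns stride 1 to C, then G=8, W=16, H=112,
+    // N=784 -- while the strides array stays in [G, N, C, H, W] storage order.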
+    index_t stride = 1;
+    for(int i = static_cast<int>(dim_order.size()) - 1; i >= 0; --i)
+    {
+        char dim_char  = dim_order[i];
+        int length_idx = map_dim_char_to_index(dim_char, ndim_spatial, is_weight);
+
+        if(length_idx >= 0 && length_idx < num_dims)
+        {
+            strides[length_idx] = stride;
+            stride *= lengths[length_idx];
+        }
+    }
+
+    return strides;
+}
+
+// Unified kernel for strided tensor copy operations
+// IsUnpack=false: Pack strided -> contiguous
+// IsUnpack=true: Unpack contiguous -> strided
+template <bool IsUnpack, typename DataType>
+__global__ void strided_copy_kernel(const DataType* __restrict__ src,
+                                    DataType* __restrict__ dst,
+                                    const index_t* tensor_lengths,
+                                    const index_t* strided_strides,
+                                    int num_dims,
+                                    long_index_t total_elements)
+{
+    const long_index_t tid         = blockIdx.x * blockDim.x + threadIdx.x;
+    const long_index_t num_threads = blockDim.x * gridDim.x;
+
+    for(long_index_t linear_idx = tid; linear_idx < total_elements; linear_idx += num_threads)
+    {
+        // Compute strided index from linear index
+        long_index_t remaining   = linear_idx;
+        long_index_t strided_idx = 0;
+
+        for(int dim = num_dims - 1; dim >= 0; --dim)
+        {
+            index_t coord = remaining % tensor_lengths[dim];
+            remaining /= tensor_lengths[dim];
+            strided_idx += coord * strided_strides[dim];
+        }
+
+        // Direction determines which is src and which is dst
+        if constexpr(IsUnpack)
+        {
+            // Unpack: src is contiguous (linear_idx), dst is strided (strided_idx)
+            dst[strided_idx] = src[linear_idx];
+        }
+        else
+        {
+            // Pack: src is strided (strided_idx), dst is contiguous (linear_idx)
+            dst[linear_idx] = src[strided_idx];
+        }
+    }
+}
+
+} // namespace ref
+} // namespace ck
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
index b9e463dc1e..67d082d07b 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
@@ -18,6 +18,7 @@
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
+#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"

 namespace ck {
@@ -89,8 +90,39 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
     wei_device_buf.ToDevice(wei.mData.data());

     float max_accumulated_value = 0;
-    if(do_verification)
+    if(do_verification == 2)
     {
+        // Use GPU reference for verification
+        std::cout << "Using GPU reference for verification" << std::endl;
+
+        // Allocate GPU reference output buffer
+        DeviceMem gpu_ref_in_buf(sizeof(InDataType) * in_host.mDesc.GetElementSpaceSize());
+
+        // Call GPU reference with ConvParam directly
+        ref::naive_conv_bwd_data(
+            reinterpret_cast<InDataType*>(gpu_ref_in_buf.GetDeviceBuffer()),
+            reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+            reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            conv_param,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+
+        // Copy GPU reference result to host for comparison
+        gpu_ref_in_buf.FromDevice(in_host.mData.data());
+        max_accumulated_value = *std::max_element(in_host.mData.begin(), in_host.mData.end());
+    }
+    else if(do_verification == 1)
+    {
+        // Use CPU reference for verification (default)
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData{};
-    auto ref_invoker = ref_conv.MakeInvoker();
-    auto ref_argument =
ref_conv.MakeArgument(input, - weight_host_result, - output, - conv_param.conv_filter_strides_, - conv_param.conv_filter_dilations_, - conv_param.input_left_pads_, - conv_param.input_right_pads_, - in_element_op, - wei_element_op, - out_element_op, - {}, - {}, - {}); + if(do_verification == 1) + { + // CPU reference + auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight{}; + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(input, + weight_host_result, + output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op, + {}, + {}, + {}); + + ref_invoker.Run(ref_argument); + } + else if(do_verification == 2) + { + // GPU reference + std::cout << "Running GPU reference implementation..." << std::endl; + + // Allocate device memory for reference + DeviceMem in_ref_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); + DeviceMem wei_ref_buf(sizeof(WeiDataType) * + weight_host_result.mDesc.GetElementSpaceSize()); + DeviceMem out_ref_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize()); + + in_ref_buf.ToDevice(input.mData.data()); + out_ref_buf.ToDevice(output.mData.data()); + + // Call GPU reference with ConvParam directly + ck::ref::naive_conv_bwd_weight( + static_cast(in_ref_buf.GetDeviceBuffer()), + static_cast(wei_ref_buf.GetDeviceBuffer()), + static_cast(out_ref_buf.GetDeviceBuffer()), + conv_param, + in_element_op, + wei_element_op, + out_element_op); + + // Copy result back to host + wei_ref_buf.FromDevice(weight_host_result.mData.data()); + } - ref_invoker.Run(ref_argument); max_accumulated_value = *std::max_element(weight_host_result.mData.begin(), weight_host_result.mData.end()); } diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 427d2b14df..aeed6f4f06 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -22,6 +22,7 @@ #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" namespace ck { namespace profiler { @@ -113,8 +114,38 @@ bool profile_grouped_conv_fwd_impl(int do_verification, wei_device_buf.ToDevice(weight.mData.data()); // run reference op - if(do_verification) + if(do_verification == 2) { + // Use GPU reference for verification + std::cout << "Using GPU reference for verification" << std::endl; + + // Allocate GPU reference output buffer + DeviceMem gpu_ref_out_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); + + // Call GPU reference with ConvParam directly + ref::naive_conv_fwd( + reinterpret_cast(in_device_buf.GetDeviceBuffer()), + reinterpret_cast(wei_device_buf.GetDeviceBuffer()), + reinterpret_cast(gpu_ref_out_buf.GetDeviceBuffer()), + conv_param, + in_element_op, + wei_element_op, + out_element_op); + + // Copy GPU reference result to host for comparison + gpu_ref_out_buf.FromDevice(host_output.mData.data()); + } + else if(do_verification == 1) + { + // Use CPU reference for verification (default) auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd +#include +#include +#include + +#include "ck/ck.hpp" +#include 
"ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using InDataType = float; +using WeiDataType = float; +using OutDataType = float; +using AccDataType = float; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +using DeviceConvNaive = ck::tensor_operation::device:: + DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K; + +template +bool run_conv3d_naive_test(const ck::utils::conv::ConvParam& conv_param) +{ + using namespace ck; + using namespace ck::tensor_operation::host; + + using InLayout = ck::tensor_layout::convolution::GNCDHW; + using WeiLayout = ck::tensor_layout::convolution::GKCZYX; + using OutLayout = ck::tensor_layout::convolution::GNKDHW; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + // Initialize tensors + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + + // Run device kernel - convert long_index_t vectors to index_t + std::vector input_spatial_lengths(conv_param.input_spatial_lengths_.begin(), + conv_param.input_spatial_lengths_.end()); + std::vector filter_spatial_lengths(conv_param.filter_spatial_lengths_.begin(), + conv_param.filter_spatial_lengths_.end()); + auto output_spatial_lengths_long = conv_param.GetOutputSpatialLengths(); + std::vector output_spatial_lengths(output_spatial_lengths_long.begin(), + output_spatial_lengths_long.end()); + std::vector conv_filter_strides(conv_param.conv_filter_strides_.begin(), + conv_param.conv_filter_strides_.end()); + std::vector conv_filter_dilations(conv_param.conv_filter_dilations_.begin(), + conv_param.conv_filter_dilations_.end()); + std::vector input_left_pads(conv_param.input_left_pads_.begin(), + conv_param.input_left_pads_.end()); + std::vector input_right_pads(conv_param.input_right_pads_.begin(), + conv_param.input_right_pads_.end()); + + auto conv = DeviceConvNaive{}; + auto invoker = conv.MakeInvoker(); + auto 
argument = + conv.MakeArgument(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + conv_param.N_, + conv_param.K_, + conv_param.C_, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + if(!conv.IsSupportedArgument(argument)) + { + std::cout << "Unsupported argument for naive conv3d kernel" << std::endl; + return false; + } + + invoker.Run(argument, StreamConfig{nullptr, false}); + + // Run CPU reference + auto ref_conv = ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + out_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + ref_invoker.Run(ref_argument); + + // Compare results + out_device_buf.FromDevice(out_device.mData.data()); + + return ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-3, 1e-3); +} + +TEST(TestConv3dNaive, Conv3dNaive_Small) +{ + // Small 3D convolution test + ck::utils::conv::ConvParam param{ + 3, // spatial_dim + 1, // G + 2, // N + 16, // K + 16, // C + {3, 3, 3}, // filter + {7, 7, 7}, // input spatial + {2, 2, 2}, // strides + {1, 1, 1}, // dilations + {1, 1, 1}, // left pads + {1, 1, 1} // right pads + }; + + bool pass = run_conv3d_naive_test<3>(param); + EXPECT_TRUE(pass); +} + +TEST(TestConv3dNaive, Conv3dNaive_Medium) +{ + // Medium size 3D convolution test + ck::utils::conv::ConvParam param{ + 3, // spatial_dim + 1, // G + 4, // N + 32, // K + 32, // C + {3, 3, 3}, // filter + {14, 14, 14}, // input spatial + {1, 1, 1}, // strides + {1, 1, 1}, // dilations + {1, 1, 1}, // left pads + {1, 1, 1} // right pads + }; + + bool pass = run_conv3d_naive_test<3>(param); + EXPECT_TRUE(pass); +} + +TEST(TestConv3dNaive, Conv3dNaive_UnitFilter) +{ + // 1x1x1 filter (no padding) + ck::utils::conv::ConvParam param{ + 3, // spatial_dim + 1, // G + 2, // N + 24, // K + 24, // C + {1, 1, 1}, // filter + {8, 8, 8}, // input spatial + {1, 1, 1}, // strides + {1, 1, 1}, // dilations + {0, 0, 0}, // left pads + {0, 0, 0} // right pads + }; + + bool pass = run_conv3d_naive_test<3>(param); + EXPECT_TRUE(pass); +} + +int main(int argc, char** argv) +{ + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/gpu_reference/CMakeLists.txt b/test/gpu_reference/CMakeLists.txt new file mode 100644 index 0000000000..443818feb3 --- /dev/null +++ b/test/gpu_reference/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT + +add_gtest_executable(test_gpu_reference_conv_fwd test_gpu_reference_conv_fwd.cpp) +target_link_libraries(test_gpu_reference_conv_fwd PRIVATE utility) + +add_gtest_executable(test_gpu_reference_conv_bwd_data test_gpu_reference_conv_bwd_data.cpp) +target_link_libraries(test_gpu_reference_conv_bwd_data PRIVATE utility) + +add_gtest_executable(test_gpu_reference_conv_bwd_weight test_gpu_reference_conv_bwd_weight.cpp) +target_link_libraries(test_gpu_reference_conv_bwd_weight PRIVATE utility) diff --git a/test/gpu_reference/common_test_params.hpp b/test/gpu_reference/common_test_params.hpp new file mode 100644 index 0000000000..bcd638d0cb --- /dev/null +++ b/test/gpu_reference/common_test_params.hpp @@ -0,0 +1,137 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck/ck.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include + +namespace ck { +namespace test { + +// Common test shapes for all convolution tests (fwd, bwd_data, bwd_weight) +namespace conv_test_shapes { + +// 2D Conv, FP16, Small +inline ck::utils::conv::ConvParam get_2d_small() +{ + return ck::utils::conv::ConvParam(2, // num_dim_spatial + 1, // G + 2, // N + 8, // K + 8, // C + {3, 3}, // filter_spatial + {7, 7}, // input_spatial + {1, 1}, // strides + {1, 1}, // dilations + {0, 0}, // left_pads + {0, 0} // right_pads + ); +} + +// 2D Conv, FP32, Medium +inline ck::utils::conv::ConvParam get_2d_medium() +{ + return ck::utils::conv::ConvParam(2, // num_dim_spatial + 1, // G + 4, // N + 16, // K + 16, // C + {3, 3}, // filter_spatial + {14, 14}, // input_spatial + {1, 1}, // strides + {1, 1}, // dilations + {0, 0}, // left_pads + {0, 0} // right_pads + ); +} + +// 1D Conv, FP16 +inline ck::utils::conv::ConvParam get_1d() +{ + return ck::utils::conv::ConvParam(1, // num_dim_spatial + 1, // G + 2, // N + 8, // K + 8, // C + {3}, // filter_spatial + {16}, // input_spatial + {1}, // strides + {1}, // dilations + {0}, // left_pads + {0} // right_pads + ); +} + +// 3D Conv, FP16, Small +inline ck::utils::conv::ConvParam get_3d_small() +{ + return ck::utils::conv::ConvParam(3, // num_dim_spatial + 1, // G + 1, // N + 8, // K + 8, // C + {3, 3, 3}, // filter_spatial + {5, 5, 5}, // input_spatial + {1, 1, 1}, // strides + {1, 1, 1}, // dilations + {0, 0, 0}, // left_pads + {0, 0, 0} // right_pads + ); +} + +// 2D Conv with stride +inline ck::utils::conv::ConvParam get_2d_stride2() +{ + return ck::utils::conv::ConvParam(2, // num_dim_spatial + 1, // G + 2, // N + 8, // K + 8, // C + {3, 3}, // filter_spatial + {8, 8}, // input_spatial + {2, 2}, // strides + {1, 1}, // dilations + {0, 0}, // left_pads + {0, 0} // right_pads + ); +} + +// 2D Grouped Conv, FP16, G=2 +inline ck::utils::conv::ConvParam get_2d_grouped_g2() +{ + return ck::utils::conv::ConvParam(2, // num_dim_spatial + 2, // G + 2, // N + 8, // K (8 total output channels) + 16, // C (16 total input channels, 8 per group with G=2) + {3, 3}, // filter_spatial + {7, 7}, // input_spatial + {1, 1}, // strides + {1, 1}, // dilations + {0, 0}, // left_pads + {0, 0} // right_pads + ); +} + +// 2D Grouped Conv, FP32, G=4 +inline ck::utils::conv::ConvParam get_2d_grouped_g4() +{ + return ck::utils::conv::ConvParam(2, // num_dim_spatial + 4, // G + 1, // N + 16, // K (16 total output channels) + 16, // C (16 total input channels, 4 per group with G=4) + {3, 3}, // filter_spatial + {8, 8}, // input_spatial + {1, 1}, // strides + {1, 1}, // dilations + {0, 0}, // 
left_pads + {0, 0} // right_pads + ); +} + +} // namespace conv_test_shapes +} // namespace test +} // namespace ck diff --git a/test/gpu_reference/gpu_reference_utils.hpp b/test/gpu_reference/gpu_reference_utils.hpp new file mode 100644 index 0000000000..fc017c8734 --- /dev/null +++ b/test/gpu_reference/gpu_reference_utils.hpp @@ -0,0 +1,385 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck/ck.hpp" +#include "ck/host_utility/hip_check_error.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +// CPU references +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp" + +// GPU references +#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +#include "common_test_params.hpp" + +namespace ck { +namespace test { + +enum class ConvKernelType +{ + Forward, + BackwardData, + BackwardWeight +}; + +// Helper function to initialize and copy a tensor to device +template +void initialize_and_copy_tensor(Tensor& host_tensor, DeviceMem& device_mem) +{ + host_tensor.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + device_mem.ToDevice(host_tensor.mData.data()); +} + +// Helper to get default layout types based on NDimSpatial +template +struct DefaultConvLayouts +{ + using InLayout = std::conditional_t>; + using WeiLayout = std::conditional_t>; + using OutLayout = std::conditional_t>; +}; + +// Forward convolution implementation +template +bool test_conv_fwd_impl(const ck::utils::conv::ConvParam& params, + const Tensor& input_cpu, + const Tensor& weight_cpu, + DeviceMem& input_dev, + DeviceMem& weight_dev, + DeviceMem& output_dev) +{ + using InElementOp = tensor_operation::element_wise::PassThrough; + using WeiElementOp = tensor_operation::element_wise::PassThrough; + using OutElementOp = tensor_operation::element_wise::PassThrough; + + // Call GPU reference with ConvParam directly + ref::naive_conv_fwd( + reinterpret_cast(input_dev.GetDeviceBuffer()), + reinterpret_cast(weight_dev.GetDeviceBuffer()), + reinterpret_cast(output_dev.GetDeviceBuffer()), + params); + + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Run CPU reference + std::vector strides_long(params.conv_filter_strides_.begin(), + params.conv_filter_strides_.end()); + std::vector dilations_long(params.conv_filter_dilations_.begin(), + params.conv_filter_dilations_.end()); + std::vector pads_long(params.input_left_pads_.begin(), + params.input_left_pads_.end()); + + Tensor input_ref = input_cpu; + Tensor weight_ref = weight_cpu; + Tensor output_ref( + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(params)); + + auto ref_conv = tensor_operation::host::ReferenceConvFwd(); + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_arg = ref_conv.MakeArgument(input_ref, + weight_ref, + output_ref, + strides_long, + 
dilations_long, + pads_long, + pads_long, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + ref_invoker.Run(ref_arg); + + // Copy result from device and compare + Tensor output_gpu(output_ref.mDesc); + output_dev.FromDevice(output_gpu.mData.data()); + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Compare results + return ck::utils::check_err(output_gpu, output_ref); +} + +// Backward data convolution implementation +template +bool test_conv_bwd_data_impl(const ck::utils::conv::ConvParam& params, + const Tensor& weight_cpu, + const Tensor& output_cpu, + DeviceMem& weight_dev, + DeviceMem& output_dev, + DeviceMem& input_dev) +{ + using InElementOp = tensor_operation::element_wise::PassThrough; + using WeiElementOp = tensor_operation::element_wise::PassThrough; + using OutElementOp = tensor_operation::element_wise::PassThrough; + + // Call GPU reference with ConvParam directly + ref::naive_conv_bwd_data( + reinterpret_cast(input_dev.GetDeviceBuffer()), + reinterpret_cast(weight_dev.GetDeviceBuffer()), + reinterpret_cast(output_dev.GetDeviceBuffer()), + params); + + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Run CPU reference + std::vector strides_long(params.conv_filter_strides_.begin(), + params.conv_filter_strides_.end()); + std::vector dilations_long(params.conv_filter_dilations_.begin(), + params.conv_filter_dilations_.end()); + std::vector pads_long(params.input_left_pads_.begin(), + params.input_left_pads_.end()); + + Tensor input_ref( + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(params)); + Tensor weight_ref = weight_cpu; + Tensor output_ref = output_cpu; + + auto ref_conv = tensor_operation::host::ReferenceConvBwdData(); + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_arg = ref_conv.MakeArgument(input_ref, + weight_ref, + output_ref, + strides_long, + dilations_long, + pads_long, + pads_long, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + ref_invoker.Run(ref_arg); + + // Copy result from device and compare + Tensor input_gpu(input_ref.mDesc); + input_dev.FromDevice(input_gpu.mData.data()); + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Compare results + return ck::utils::check_err(input_gpu, input_ref); +} + +// Backward weight convolution implementation +template +bool test_conv_bwd_weight_impl(const ck::utils::conv::ConvParam& params, + const Tensor& input_cpu, + const Tensor& output_cpu, + DeviceMem& input_dev, + DeviceMem& output_dev, + DeviceMem& weight_dev) +{ + using InElementOp = tensor_operation::element_wise::PassThrough; + using WeiElementOp = tensor_operation::element_wise::PassThrough; + using OutElementOp = tensor_operation::element_wise::PassThrough; + + // Call GPU reference with ConvParam directly + ref::naive_conv_bwd_weight( + reinterpret_cast(input_dev.GetDeviceBuffer()), + reinterpret_cast(weight_dev.GetDeviceBuffer()), + reinterpret_cast(output_dev.GetDeviceBuffer()), + params); + + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Run CPU reference + std::vector strides_long(params.conv_filter_strides_.begin(), + params.conv_filter_strides_.end()); + std::vector dilations_long(params.conv_filter_dilations_.begin(), + params.conv_filter_dilations_.end()); + std::vector pads_long(params.input_left_pads_.begin(), + params.input_left_pads_.end()); + + Tensor input_ref = input_cpu; + Tensor weight_ref( + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(params)); + Tensor output_ref = output_cpu; + + auto ref_conv = tensor_operation::host::ReferenceConvBwdWeight(); + auto ref_invoker = 
ref_conv.MakeInvoker(); + auto ref_arg = ref_conv.MakeArgument(input_ref, + weight_ref, + output_ref, + strides_long, + dilations_long, + pads_long, + pads_long, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + ref_invoker.Run(ref_arg); + + // Copy result from device and compare + Tensor weight_gpu(weight_ref.mDesc); + weight_dev.FromDevice(weight_gpu.mData.data()); + HIP_CHECK_ERROR(hipDeviceSynchronize()); + + // Compare results + return ck::utils::check_err(weight_gpu, weight_ref); +} + +// Main test function - dispatches to specific implementations +template ::InLayout, + typename WeiLayout = typename DefaultConvLayouts::WeiLayout, + typename OutLayout = typename DefaultConvLayouts::OutLayout> +bool test_conv_gpu_ref(const ck::utils::conv::ConvParam& params, ConvKernelType kernel_type) +{ + // Create tensor descriptors using the specified layouts + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(params); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(params); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(params); + + // Create tensors using tensor descriptors (supports multiple layouts) + Tensor input(in_g_n_c_wis_desc); + Tensor weight(wei_g_k_c_xs_desc); + Tensor output(out_g_n_k_wos_desc); + + // Allocate device memory + DeviceMem input_dev(input.mData.size() * sizeof(InDataType)); + DeviceMem weight_dev(weight.mData.size() * sizeof(WeiDataType)); + DeviceMem output_dev(output.mData.size() * sizeof(OutDataType)); + + // Initialize and copy tensors based on kernel type + if(kernel_type == ConvKernelType::Forward) + { + initialize_and_copy_tensor(input, input_dev); + initialize_and_copy_tensor(weight, weight_dev); + } + else if(kernel_type == ConvKernelType::BackwardData) + { + initialize_and_copy_tensor(weight, weight_dev); + initialize_and_copy_tensor(output, output_dev); + } + else // BackwardWeight + { + initialize_and_copy_tensor(input, input_dev); + initialize_and_copy_tensor(output, output_dev); + } + + // Dispatch to appropriate implementation with layout types + if(kernel_type == ConvKernelType::Forward) + { + return test_conv_fwd_impl( + params, input, weight, input_dev, weight_dev, output_dev); + } + else if(kernel_type == ConvKernelType::BackwardData) + { + return test_conv_bwd_data_impl( + params, weight, output, weight_dev, output_dev, input_dev); + } + else // BackwardWeight + { + return test_conv_bwd_weight_impl( + params, input, output, input_dev, output_dev, weight_dev); + } +} + +} // namespace test +} // namespace ck diff --git a/test/gpu_reference/test_gpu_reference_conv_bwd_data.cpp b/test/gpu_reference/test_gpu_reference_conv_bwd_data.cpp new file mode 100644 index 0000000000..0d69a9e77b --- /dev/null +++ b/test/gpu_reference/test_gpu_reference_conv_bwd_data.cpp @@ -0,0 +1,224 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+#include "gpu_reference_utils.hpp"
+
+using namespace ck;
+using ck::test::ConvKernelType;
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_2d_small();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP32Medium)
+{
+    auto params = test::conv_test_shapes::get_2d_medium();
+    bool result =
+        test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv1DFP16)
+{
+    auto params = test::conv_test_shapes::get_1d();
+    bool result =
+        test::test_conv_gpu_ref<1, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv3DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    bool result =
+        test::test_conv_gpu_ref<3, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16Stride2)
+{
+    auto params = test::conv_test_shapes::get_2d_stride2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedG2)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedG4)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g4();
+    bool result =
+        test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NHWGC,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NHWGK>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NHWGC,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NHWGK>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKCYX,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKCYX,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdData, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::BackwardData);
+    EXPECT_TRUE(result);
+}
diff --git a/test/gpu_reference/test_gpu_reference_conv_bwd_weight.cpp b/test/gpu_reference/test_gpu_reference_conv_bwd_weight.cpp
new file mode 100644
index 0000000000..b373052a0f
--- /dev/null
+++ b/test/gpu_reference/test_gpu_reference_conv_bwd_weight.cpp
@@ -0,0 +1,224 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+#include "gpu_reference_utils.hpp"
+
+using namespace ck;
+using ck::test::ConvKernelType;
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_2d_small();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP32Medium)
+{
+    auto params = test::conv_test_shapes::get_2d_medium();
+    bool result =
+        test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv1DFP16)
+{
+    auto params = test::conv_test_shapes::get_1d();
+    bool result =
+        test::test_conv_gpu_ref<1, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv3DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    bool result =
+        test::test_conv_gpu_ref<3, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16Stride2)
+{
+    auto params = test::conv_test_shapes::get_2d_stride2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedG2)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedG4)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g4();
+    bool result =
+        test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NHWGC,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NHWGK>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NHWGC,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NHWGK>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKYXC,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKCYX,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result = test::test_conv_gpu_ref<2,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCHW,
+                                          tensor_layout::convolution::GKCYX,
+                                          tensor_layout::convolution::NGKHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvBwdWeight, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::BackwardWeight);
+    EXPECT_TRUE(result);
+}
diff --git a/test/gpu_reference/test_gpu_reference_conv_fwd.cpp b/test/gpu_reference/test_gpu_reference_conv_fwd.cpp
new file mode 100644
index 0000000000..1182922a58
--- /dev/null
+++ b/test/gpu_reference/test_gpu_reference_conv_fwd.cpp
@@ -0,0 +1,222 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+#include "gpu_reference_utils.hpp"
+
+using namespace ck;
+using ck::test::ConvKernelType;
+
+TEST(GpuReferenceConvFwd, Conv2DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_2d_small();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP32Medium)
+{
+    auto params = test::conv_test_shapes::get_2d_medium();
+    bool result = test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv1DFP16)
+{
+    auto params = test::conv_test_shapes::get_1d();
+    bool result =
+        test::test_conv_gpu_ref<1, half_t, half_t, half_t>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv3DFP16Small)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    bool result =
+        test::test_conv_gpu_ref<3, half_t, half_t, half_t>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP16Stride2)
+{
+    auto params = test::conv_test_shapes::get_2d_stride2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP16GroupedG2)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2, half_t, half_t, half_t>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP32GroupedG4)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g4();
+    bool result = test::test_conv_gpu_ref<2, float, float, float>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                float,
+                                float,
+                                float,
+                                tensor_layout::convolution::NHWGC,
+                                tensor_layout::convolution::GKYXC,
+                                tensor_layout::convolution::NHWGK>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNHWGC_GKYXC_NHWGK)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                half_t,
+                                half_t,
+                                half_t,
+                                tensor_layout::convolution::NHWGC,
+                                tensor_layout::convolution::GKYXC,
+                                tensor_layout::convolution::NHWGK>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                float,
+                                float,
+                                float,
+                                tensor_layout::convolution::NGCHW,
+                                tensor_layout::convolution::GKYXC,
+                                tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNGCHW_GKYXC_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                half_t,
+                                half_t,
+                                half_t,
+                                tensor_layout::convolution::NGCHW,
+                                tensor_layout::convolution::GKYXC,
+                                tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP32GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                float,
+                                float,
+                                float,
+                                tensor_layout::convolution::NGCHW,
+                                tensor_layout::convolution::GKCYX,
+                                tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv2DFP16GroupedNGCHW_GKCYX_NGKHW)
+{
+    auto params = test::conv_test_shapes::get_2d_grouped_g2();
+    bool result =
+        test::test_conv_gpu_ref<2,
+                                half_t,
+                                half_t,
+                                half_t,
+                                tensor_layout::convolution::NGCHW,
+                                tensor_layout::convolution::GKCYX,
+                                tensor_layout::convolution::NGKHW>(params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv3DFP32GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv3DFP16GroupedNDHWGC_GKZYXC_NDHWGK)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NDHWGC,
+                                          tensor_layout::convolution::GKZYXC,
+                                          tensor_layout::convolution::NDHWGK>(
+        params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv3DFP32GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          float,
+                                          float,
+                                          float,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
+
+TEST(GpuReferenceConvFwd, Conv3DFP16GroupedNGCDHW_GKCZYX_NGKDHW)
+{
+    auto params = test::conv_test_shapes::get_3d_small();
+    // Modify to be grouped (G=2)
+    params.G_ = 2;
+    params.C_ = 16; // 8 per group
+    params.K_ = 16; // 8 per group
+
+    bool result = test::test_conv_gpu_ref<3,
+                                          half_t,
+                                          half_t,
+                                          half_t,
+                                          tensor_layout::convolution::NGCDHW,
+                                          tensor_layout::convolution::GKCZYX,
+                                          tensor_layout::convolution::NGKDHW>(
+        params, ConvKernelType::Forward);
+    EXPECT_TRUE(result);
+}
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp
index 53b8ec32af..a30f5d349d 100644
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_dataset_xdl.cpp
@@ -73,7 +73,7 @@ bool RunConvBwdDataTest(const ck::utils::conv::ConvParam& param, ck::index_t spl
                InLayout,
                DataType,
                DataType,
-               DataType>(true, // do_verification
+               DataType>(2, // do_verification
                          1, // init_method
                          false, // do_log
                          false, // time_kernel
diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
index 14ed1b8939..efedf416f0 100644
--- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
+++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
@@ -47,7 +47,7 @@ class TestGroupedConvndBwdDataXdl : public ::testing::Test
             DataType,
             DataType,
             DataType>(
-            true, // do_verification
+            2, // do_verification
             1, // init_method: integer value
             false, // do_log
             false, // time_kernel
diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
index 4b5e38dea6..5d56615834 100644
--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
@@ -73,7 +73,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
             InDataType,
             WeiDataType,
             OutDataType>(
-            true, // do_verification
+            2, // do_verification
             1, // init_method: integer value
             false, // do_log
             false, // time_kernel
diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp
index aff6ba8873..07d80dfad2 100644
--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_dataset_xdl.cpp
@@ -80,7 +80,7 @@ bool RunConvBwdWeightTest(const ck::utils::conv::ConvParam& param, ck::index_t s
             InDataType,
             WeiDataType,
             OutDataType>(
-            true, // do_verification
+            2, // do_verification
             1, // init_method
             false, // do_log
             false, // time_kernel
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
index b5c5248df5..e1207b1133 100644
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
@@ -46,7 +46,7 @@ class TestGroupedConvndFwd : public ::testing::Test
             DataType,
             DataType,
             IndexType>(
-            true, // do_verification
+            2, // do_verification
             1, // init_method: integer value
             false, // do_log
             false, // time_kernel
diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp
index c99f7ccf2f..8bfdbabd54 100644
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_dataset_xdl.cpp
@@ -77,7 +77,7 @@ bool RunConvTest(const ck::utils::conv::ConvParam& param)
             DataType,
             DataType,
             DataType,
-            IndexType>(true, // do_verification
+            IndexType>(2, // do_verification
                        1, // init_method
                        false, // do_log
                        false, // time_kernel
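
Note: every test case in the three new files follows the same shape: build a ConvParam through a test::conv_test_shapes helper, instantiate test::test_conv_gpu_ref with the spatial rank, the in/wei/out data types, optional layout tags, and the ConvKernelType, then assert on the boolean result. As a minimal sketch of what a follow-on case could look like (hypothetical, not part of this diff; it assumes test_conv_gpu_ref and the GPU reference kernels also instantiate for ck::bhalf_t, which these files only exercise for half_t and float):

// Hypothetical example, shown only to illustrate the test pattern above.
// Assumes bf16 (ck::bhalf_t) support in test_conv_gpu_ref; this diff does
// not add or verify such support.
TEST(GpuReferenceConvFwd, Conv2DBF16Small)
{
    auto params = test::conv_test_shapes::get_2d_small();
    bool result =
        test::test_conv_gpu_ref<2, bhalf_t, bhalf_t, bhalf_t>(params, ConvKernelType::Forward);
    EXPECT_TRUE(result);
}

Because the layout tags default when omitted, such a case would run against the helper's default layouts; passing explicit tags, as the grouped NHWGC/NGCHW cases above do, is only needed when a non-default layout is under test.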