diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp
index eb8b5c76d3..9e125c4e5d 100644
--- a/example/12_reduce/reduce_blockwise_two_call.cpp
+++ b/example/12_reduce/reduce_blockwise_two_call.cpp
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
     const std::array<int, 2> reduceDims = {3, 4};
     // const std::array<int, 3> invariantDims = {0, 1, 2};
 
-    const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
+    std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
 
     // input lengths of the second reduction, which is also the output lengths of the first
     // reduction
-    const std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
+    std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
 
-    const std::vector<size_t> outLengths = {64, 320, 80};
+    std::vector<size_t> outLengths = {64, 320, 80};
 
     if(argc == 1)
     {
@@ -114,11 +114,26 @@ int main(int argc, char* argv[])
         init_method = 2;
         time_kernel = true;
     }
-    else if(argc == 4)
+    else if((argc == 4) || (argc == 9))
     {
         do_verify   = static_cast<bool>(argv[1]);
         init_method = atoi(argv[2]);
         time_kernel = static_cast<bool>(atoi(argv[3]));
+        if(argc == 9)
+        {
+            inLengths_1[0] = atoi(argv[4]);
+            inLengths_1[1] = atoi(argv[5]);
+            inLengths_1[2] = atoi(argv[6]);
+            inLengths_1[3] = atoi(argv[7]);
+            inLengths_1[4] = atoi(argv[8]);
+            inLengths_2[0] = inLengths_1[0];
+            inLengths_2[1] = inLengths_1[1];
+            inLengths_2[2] = inLengths_1[2];
+            inLengths_2[3] = inLengths_1[3];
+            outLengths[0]  = inLengths_1[0];
+            outLengths[1]  = inLengths_1[1];
+            outLengths[2]  = inLengths_1[2];
+        }
     }
     else
    {
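The argc == 9 branch above encodes an invariant rather than taking nine free sizes: the second reduction's input lengths are the leading four entries of inLengths_1, and the final output lengths are the leading three. A minimal sketch of that prefix relationship, using a hypothetical helper name that is not part of the example:

#include <cstddef>
#include <vector>

// Hypothetical helper: the downstream shapes are prefixes of inLengths_1,
// so deriving them replaces the element-by-element copies in the diff above.
std::vector<std::size_t> leading(const std::vector<std::size_t>& in, std::size_t n)
{
    return {in.begin(), in.begin() + n};
}

// usage sketch: inLengths_2 = leading(inLengths_1, 4); outLengths = leading(inLengths_1, 3);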
diff --git a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
index 26a03f289d..a1b952259f 100644
--- a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
+++ b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
@@ -50,14 +50,14 @@ template<> struct emb_kernel { using kernel_type = DeviceInsta
 // clang-format on
 
-int main()
+int main(int argc, char* argv[])
 {
     bool time_kernel = true;
 
-    constexpr auto num_rows     = 65536;
-    constexpr auto dims         = ck::Sequence<256, 512, 768, 1024, 1536, 2048, 4096, 8192>{};
-    // constexpr auto dims = ck::Sequence<256, 512>{};
-    constexpr auto index_length = 2048;
+    ck::index_t num_rows      = 65536;
+    constexpr auto dims       = ck::Sequence<256, 512, 768, 1024, 1536, 2048, 4096, 8192>{};
+    ck::index_t index_length  = 2048;
+    ck::index_t dim_mask      = 0xffff;
 
     constexpr AccDataType epsilon = 1e-4;
 
     auto f_host_tensor_desc_1d = [](std::size_t len_) { return HostTensorDescriptor({len_}); };
@@ -73,121 +73,140 @@ int main()
                                                     BetaDataType,
                                                     AccDataType,
                                                     OutType>;
-
+    if(argc == 1)
+    {
+        // Use default value
+    }
+    else if(argc == 4)
+    {
+        num_rows     = atoi(argv[1]);
+        dim_mask     = strtol(argv[2], nullptr, 0);
+        index_length = atoi(argv[3]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1-3: num_rows dim_mask index_length" << std::endl;
+    }
     ck::static_for<0, dims.Size(), 1>{}([&](auto I) {
-        std::srand(std::time(nullptr));
-        constexpr auto current_dim = dims.At(I);
-        Tensor<EmbType> emb_a(f_host_tensor_desc_2d(num_rows, current_dim));
-        Tensor<EmbType> emb_b(f_host_tensor_desc_2d(num_rows, current_dim));
-        Tensor<EmbType> emb_c(f_host_tensor_desc_2d(num_rows, current_dim));
-
-        Tensor<IndexType> index_a(f_host_tensor_desc_1d(index_length));
-        Tensor<IndexType> index_b(f_host_tensor_desc_1d(index_length));
-        Tensor<IndexType> index_c(f_host_tensor_desc_1d(index_length));
-
-        Tensor<GammaDataType> gamma(f_host_tensor_desc_1d(current_dim));
-        Tensor<BetaDataType> beta(f_host_tensor_desc_1d(current_dim));
-
-        Tensor<OutType> out(f_host_tensor_desc_2d(index_length, current_dim));
-
-        emb_a.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
-        emb_b.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
-        emb_c.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
-
-        index_a.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
-        index_b.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
-        index_c.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
-
-        gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
-        beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
-
-        DeviceMem emb_a_dev(sizeof(EmbType) * emb_a.mDesc.GetElementSpaceSize());
-        DeviceMem emb_b_dev(sizeof(EmbType) * emb_b.mDesc.GetElementSpaceSize());
-        DeviceMem emb_c_dev(sizeof(EmbType) * emb_c.mDesc.GetElementSpaceSize());
-
-        DeviceMem index_a_dev(sizeof(IndexType) * index_a.mDesc.GetElementSpaceSize());
-        DeviceMem index_b_dev(sizeof(IndexType) * index_b.mDesc.GetElementSpaceSize());
-        DeviceMem index_c_dev(sizeof(IndexType) * index_c.mDesc.GetElementSpaceSize());
-
-        DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
-        DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
-
-        DeviceMem out_dev(sizeof(OutType) * out.mDesc.GetElementSpaceSize());
-
-        emb_a_dev.ToDevice(emb_a.mData.data());
-        emb_b_dev.ToDevice(emb_b.mData.data());
-        emb_c_dev.ToDevice(emb_c.mData.data());
-
-        index_a_dev.ToDevice(index_a.mData.data());
-        index_b_dev.ToDevice(index_b.mData.data());
-        index_c_dev.ToDevice(index_c.mData.data());
-
-        gamma_dev.ToDevice(gamma.mData.data());
-        beta_dev.ToDevice(beta.mData.data());
-
-        auto device_instance = typename emb_kernel::kernel_type{};
-        auto argument_ptr    = device_instance.MakeArgumentPointer(
-            out_dev.GetDeviceBuffer(),
-            {ck::type_convert<EmbType*>(emb_a_dev.GetDeviceBuffer()),
-             ck::type_convert<EmbType*>(emb_b_dev.GetDeviceBuffer()),
-             ck::type_convert<EmbType*>(emb_c_dev.GetDeviceBuffer())},
-            {ck::type_convert<IndexType*>(index_a_dev.GetDeviceBuffer()),
-             ck::type_convert<IndexType*>(index_b_dev.GetDeviceBuffer()),
-             ck::type_convert<IndexType*>(index_c_dev.GetDeviceBuffer())},
-            gamma_dev.GetDeviceBuffer(),
-            beta_dev.GetDeviceBuffer(),
-            current_dim,
-            index_length,
-            epsilon,
-            EmbElementwiseOperation{});
-        std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString()
-                  << std::endl
-                  << std::flush;
-
-        bool is_supported = device_instance.IsSupportedArgument(argument_ptr.get());
-
-        if(!is_supported)
+        if(dim_mask & (1 << I.value))
         {
-            std::cout << "Runtime parameters are not supported" << std::endl;
-            return;
+            std::srand(std::time(nullptr));
+            constexpr auto current_dim = dims.At(I);
+            Tensor<EmbType> emb_a(f_host_tensor_desc_2d(num_rows, current_dim));
+            Tensor<EmbType> emb_b(f_host_tensor_desc_2d(num_rows, current_dim));
+            Tensor<EmbType> emb_c(f_host_tensor_desc_2d(num_rows, current_dim));
+
+            Tensor<IndexType> index_a(f_host_tensor_desc_1d(index_length));
+            Tensor<IndexType> index_b(f_host_tensor_desc_1d(index_length));
+            Tensor<IndexType> index_c(f_host_tensor_desc_1d(index_length));
+
+            Tensor<GammaDataType> gamma(f_host_tensor_desc_1d(current_dim));
+            Tensor<BetaDataType> beta(f_host_tensor_desc_1d(current_dim));
+
+            Tensor<OutType> out(f_host_tensor_desc_2d(index_length, current_dim));
+
+            emb_a.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
+            emb_b.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
+            emb_c.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
+
+            index_a.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
+            index_b.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
+            index_c.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
+
+            gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
+            beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
+
+            DeviceMem emb_a_dev(sizeof(EmbType) * emb_a.mDesc.GetElementSpaceSize());
+            DeviceMem emb_b_dev(sizeof(EmbType) * emb_b.mDesc.GetElementSpaceSize());
+            DeviceMem emb_c_dev(sizeof(EmbType) * emb_c.mDesc.GetElementSpaceSize());
+
+            DeviceMem index_a_dev(sizeof(IndexType) * index_a.mDesc.GetElementSpaceSize());
+            DeviceMem index_b_dev(sizeof(IndexType) * index_b.mDesc.GetElementSpaceSize());
+            DeviceMem index_c_dev(sizeof(IndexType) * index_c.mDesc.GetElementSpaceSize());
+
+            DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
+            DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
+
+            DeviceMem out_dev(sizeof(OutType) * out.mDesc.GetElementSpaceSize());
+
+            emb_a_dev.ToDevice(emb_a.mData.data());
+            emb_b_dev.ToDevice(emb_b.mData.data());
+            emb_c_dev.ToDevice(emb_c.mData.data());
+
+            index_a_dev.ToDevice(index_a.mData.data());
+            index_b_dev.ToDevice(index_b.mData.data());
+            index_c_dev.ToDevice(index_c.mData.data());
+
+            gamma_dev.ToDevice(gamma.mData.data());
+            beta_dev.ToDevice(beta.mData.data());
+
+            auto device_instance = typename emb_kernel::kernel_type{};
+            auto argument_ptr    = device_instance.MakeArgumentPointer(
+                out_dev.GetDeviceBuffer(),
+                {ck::type_convert<EmbType*>(emb_a_dev.GetDeviceBuffer()),
+                 ck::type_convert<EmbType*>(emb_b_dev.GetDeviceBuffer()),
+                 ck::type_convert<EmbType*>(emb_c_dev.GetDeviceBuffer())},
+                {ck::type_convert<IndexType*>(index_a_dev.GetDeviceBuffer()),
+                 ck::type_convert<IndexType*>(index_b_dev.GetDeviceBuffer()),
+                 ck::type_convert<IndexType*>(index_c_dev.GetDeviceBuffer())},
+                gamma_dev.GetDeviceBuffer(),
+                beta_dev.GetDeviceBuffer(),
+                current_dim,
+                index_length,
+                epsilon,
+                EmbElementwiseOperation{});
+            std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString()
+                      << std::endl
+                      << std::flush;
+
+            bool is_supported = device_instance.IsSupportedArgument(argument_ptr.get());
+
+            if(!is_supported)
+            {
+                std::cout << "Runtime parameters are not supported" << std::endl;
+                return;
+            }
+
+            auto invoker_ptr = device_instance.MakeInvokerPointer();
+            float time_ms =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+            bool pass = true;
+            {
+                Tensor<OutType> out_from_dev(f_host_tensor_desc_2d(index_length, current_dim));
+                ReferenceInstance ref;
+                auto ref_argument = ref.MakeArgument(out,
+                                                     emb_a,
+                                                     emb_b,
+                                                     emb_c,
+                                                     index_a,
+                                                     index_b,
+                                                     index_c,
+                                                     gamma,
+                                                     beta,
+                                                     num_rows,
+                                                     current_dim,
+                                                     index_length,
+                                                     epsilon);
+                auto ref_invoker = ref.MakeInvoker();
+                ref_invoker.Run(ref_argument);
+
+                out_dev.FromDevice(out_from_dev.mData.data());
+                pass &=
+                    ck::utils::check_err(out_from_dev, out, "Error: Incorrect results", 1e-3, 1e-3);
+            }
+
+            double total_read = current_dim * index_length * 3 * sizeof(EmbType) +
+                                current_dim * sizeof(GammaDataType) +
+                                current_dim * sizeof(BetaDataType);
+            double total_write = current_dim * index_length * sizeof(OutType);
+            double gbps        = (total_read + total_write) / time_ms / 1e6;
+
+            std::cout << ", total bytes:" << (total_read + total_write) << ", time:" << time_ms
+                      << ", gbps:" << gbps << ", valid:" << (pass ? "y" : "n") << std::endl
+                      << std::flush;
         }
-
-        auto invoker_ptr = device_instance.MakeInvokerPointer();
-        float time_ms = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-
-        bool pass = true;
-        {
-            Tensor<OutType> out_from_dev(f_host_tensor_desc_2d(index_length, current_dim));
-            ReferenceInstance ref;
-            auto ref_argument = ref.MakeArgument(out,
-                                                 emb_a,
-                                                 emb_b,
-                                                 emb_c,
-                                                 index_a,
-                                                 index_b,
-                                                 index_c,
-                                                 gamma,
-                                                 beta,
-                                                 num_rows,
-                                                 current_dim,
-                                                 index_length,
-                                                 epsilon);
-            auto ref_invoker = ref.MakeInvoker();
-            ref_invoker.Run(ref_argument);
-
-            out_dev.FromDevice(out_from_dev.mData.data());
-            pass &= ck::utils::check_err(out_from_dev, out, "Error: Incorrect results", 1e-3, 1e-3);
-        }
-
-        double total_read = current_dim * index_length * 3 * sizeof(EmbType) +
-                            current_dim * sizeof(GammaDataType) +
-                            current_dim * sizeof(BetaDataType);
-        double total_write = current_dim * index_length * sizeof(OutType);
-        double gbps = (total_read + total_write) / time_ms / 1e6;
-
-        std::cout << ", total bytes:" << (total_read + total_write) << ", time:" << time_ms
-                  << ", gbps:" << gbps << ", valid:" << (pass ? "y" : "n") << std::endl
-                  << std::flush;
     });
 
     return 0;
diff --git a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
index 31d1bef520..8ddd432c11 100644
--- a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
@@ -68,6 +68,24 @@ int main(int argc, char* argv[])
     }
 
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 5)
+    {
+        nchw[0] = std::stoi(argv[1]);
+        nchw[1] = std::stoi(argv[2]);
+        nchw[2] = std::stoi(argv[3]);
+        nchw[3] = std::stoi(argv[4]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 4: N, C, H, W" << std::endl;
+
+        return 1;
+    }
+
     std::array<ck::index_t, 4> ab_lengths;
     std::array<ck::index_t, 4> ab_strides = {static_cast<ck::index_t>(nchw[1] * nchw[2] * nchw[3]),
                                              static_cast<ck::index_t>(nchw[2] * nchw[3]),
diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
index 51006e676b..8064809123 100644
--- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
+++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <...>
 #include <...>
@@ -98,8 +98,23 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    ck::index_t M      = 48 * 256;
-    ck::index_t N      = 1024;
+    ck::index_t M = 48 * 256;
+    ck::index_t N = 1024;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 3)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 2: M, N" << std::endl;
+        return 1;
+    }
+
     ck::index_t Stride = N;
 
     auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
diff --git a/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp b/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp
index 6cf1b2ff91..dcbb472118 100644
--- a/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp
+++ b/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp
@@ -100,7 +100,7 @@ using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizatio
     4,  // DGammaDstVectorSize
     4>; // DBetaDstVectorSize
 
-int main()
+int main(int argc, char* argv[])
 {
     bool time_kernel = false;
 
@@ -110,6 +110,25 @@ int main()
     ck::index_t G = 32;
     ck::index_t C = 64;
 
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 6)
+    {
+        N = std::stoi(argv[1]);
+        H = std::stoi(argv[2]);
+        W = std::stoi(argv[3]);
+        G = std::stoi(argv[4]);
+        C = std::stoi(argv[5]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
+
+        return 1;
+    }
+
     Tensor<DYDataType> dy({N, H, W, G, C});
     Tensor<XDataType> x({N, H, W, G, C});
     Tensor<GammaDataType> gamma({G, C});
diff --git a/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp b/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp
index 7cf0fed74f..537a4703d3 100644
--- a/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp
@@ -39,7 +39,8 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 4;
     constexpr index_t WindowRank = 2;
@@ -166,6 +167,11 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
         {
             ++num_kernel;
             instance_found = true;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -249,7 +255,11 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "avg_pool2d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass && instance_found;
 }
diff --git a/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp b/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
index fba8f6f67f..c97e42228d 100644
--- a/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
@@ -48,7 +48,8 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 5;
     constexpr index_t WindowRank = 3;
@@ -166,6 +167,11 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -246,7 +252,11 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "avg_pool3d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
index 2f6a50cbd4..ca0d031dba 100644
--- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
@@ -49,10 +49,10 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
                                                          int O,
                                                          int G0,
                                                          int G1,
-                                                         float alpha = -1.f)
+                                                         float alpha = -1.f,
+                                                         int instance_index = -1)
 {
-
     using PassThrough = tensor_operation::element_wise::PassThrough;
     using ScaleAdd    = tensor_operation::element_wise::ScaleAdd;
 
     using AElementOp = PassThrough;
@@ -277,7 +277,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
@@ -314,6 +314,13 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+
             std::string op_name = op_ptr->GetTypeString();
 
             float ave_time =
@@ -392,6 +399,11 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_bias_softmax_gemm_permute_instance (" << instance_index << "/"
+                  << num_kernel << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_batched_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_impl.hpp
index 79ca7029c6..0fdda68c4d 100644
--- a/profiler/include/profiler/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp
@@ -47,7 +47,8 @@ bool profile_batched_gemm_impl(int do_verification,
                                int BatchStrideA,
                                int BatchStrideB,
                                int BatchStrideC,
-                               int BatchCount)
+                               int BatchCount,
+                               int instance_index = -1)
 {
     bool pass = true;
 
@@ -138,6 +139,7 @@ bool profile_batched_gemm_impl(int do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
+    int num_kernel        = 0;
 
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
@@ -203,6 +205,12 @@ bool profile_batched_gemm_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // re-init C to zero before profiling next kernel
             c_device_buf.SetZero();
 
@@ -259,6 +267,11 @@ bool profile_batched_gemm_impl(int do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
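Each profiler header above inlines the same three-line filter. Factored out for clarity, the check looks like the sketch below; the helper name is hypothetical (the PR keeps the logic inline). Note the off-by-one convention: instance_index is a 0-based user choice, while num_kernel counts supported instances starting from 1, hence the "+ 1"; -1 keeps the old profile-everything behavior.

// Hypothetical distillation of the repeated skip pattern:
inline bool skip_instance(int instance_index, int num_kernel)
{
    return (instance_index != -1) && (instance_index + 1 != num_kernel);
}

// usage inside the instance loop:
//     ++num_kernel;
//     if(skip_instance(instance_index, num_kernel))
//         continue;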
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
index 03fa1b1371..183b0e183a 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
@@ -40,19 +40,19 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
                                             int N,
                                             int K,
                                             int O,
-                                            int BatchCount    = 1,
-                                            int StrideA       = -1,
-                                            int StrideB0      = -1,
-                                            int StrideB1      = -1,
-                                            int StrideC       = -1,
-                                            int BatchStrideA  = -1,
-                                            int BatchStrideB0 = -1,
-                                            int BatchStrideB1 = -1,
-                                            int BatchStrideC  = -1,
-                                            float alpha       = -1.f)
+                                            int BatchCount     = 1,
+                                            int StrideA        = -1,
+                                            int StrideB0       = -1,
+                                            int StrideB1       = -1,
+                                            int StrideC        = -1,
+                                            int BatchStrideA   = -1,
+                                            int BatchStrideB0  = -1,
+                                            int BatchStrideB1  = -1,
+                                            int BatchStrideC   = -1,
+                                            float alpha        = -1.f,
+                                            int instance_index = -1)
 {
-
     using Row = tensor_layout::gemm::RowMajor;
     using Col = tensor_layout::gemm::ColumnMajor;
 
     using PassThrough = tensor_operation::element_wise::PassThrough;
@@ -253,7 +253,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
@@ -285,6 +285,13 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+
             std::string op_name = op_ptr->GetTypeString();
 
             float ave_time =
@@ -341,7 +348,11 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_softmax_gemm_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
index 2945a4a66d..e953cc4b66 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
@@ -48,10 +48,10 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
                                                     int O,
                                                     int G0,
                                                     int G1,
-                                                    float alpha = -1.f)
+                                                    float alpha = -1.f,
+                                                    int instance_index = -1)
 {
-
     using PassThrough = tensor_operation::element_wise::PassThrough;
     using Scale       = tensor_operation::element_wise::Scale;
 
     using AElementOp = PassThrough;
@@ -254,6 +254,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
+    int num_kernel        = 0;
 
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
@@ -287,6 +288,13 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+
             std::string op_name = op_ptr->GetTypeString();
 
             float ave_time =
@@ -362,7 +370,11 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_softmax_gemm_permute_instance (" << instance_index << "/"
+                  << num_kernel << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
index 3343b5e66e..bf5a661407 100644
--- a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
@@ -34,7 +34,8 @@ bool profile_batchnorm_backward_impl(bool do_verification,
                                      const std::vector<size_t> inOutLengths,
                                      const std::vector<int> reduceDims,
                                      bool haveSavedMeanInvVar,
-                                     double epsilon)
+                                     double epsilon,
+                                     index_t instance_index = -1)
 {
     if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
     {
@@ -293,6 +294,11 @@ bool profile_batchnorm_backward_impl(bool do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -382,7 +388,11 @@ bool profile_batchnorm_backward_impl(bool do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "batchnorm_backward_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
index 2f9538b16c..078f6bff87 100644
--- a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
@@ -35,7 +35,8 @@ bool profile_batchnorm_forward_impl(int do_verification,
                                     bool updateMovingAverage,
                                     bool saveMeanAndInvVariance,
                                     double averageFactor,
-                                    double epsilon)
+                                    double epsilon,
+                                    index_t instance_index = -1)
 {
     if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
     {
@@ -287,6 +288,11 @@ bool profile_batchnorm_forward_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -404,7 +410,11 @@ bool profile_batchnorm_forward_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "batchnorm_forward_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
index 1b31a2aabf..c866b88e8a 100644
--- a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
@@ -32,7 +32,8 @@ bool profile_batchnorm_infer_impl(int do_verification,
                                   bool time_kernel,
                                   const std::vector<size_t> inOutLengths,
                                   const std::vector<int> reduceDims,
-                                  double epsilon)
+                                  double epsilon,
+                                  index_t instance_index = -1)
 {
     if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
     {
@@ -253,6 +254,11 @@ bool profile_batchnorm_infer_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -327,7 +333,11 @@ bool profile_batchnorm_infer_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "batchnorm_infer_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_contraction_impl.hpp b/profiler/include/profiler/profile_contraction_impl.hpp
index 616e824ce1..361861a6d1 100644
--- a/profiler/include/profiler/profile_contraction_impl.hpp
+++ b/profiler/include/profiler/profile_contraction_impl.hpp
@@ -54,7 +54,8 @@ int profile_contraction_impl(ck::index_t do_verification,
                             const std::vector<ck::index_t>& StridesA, // [M0, M1, K0, K1]
                             const std::vector<ck::index_t>& StridesB, // [N0, N1, K0, K1]
                             const std::vector<ck::index_t>& StridesE, // [M0, M1, N0, N1]
-                            const std::vector<ck::index_t>& StridesD) // [M0, M1, N0, N1]
+                            const std::vector<ck::index_t>& StridesD, // [M0, M1, N0, N1]
+                            int instance_index = -1)
 {
     bool pass = true;
 
@@ -197,7 +198,7 @@ int profile_contraction_impl(ck::index_t do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
@@ -256,6 +257,12 @@ int profile_contraction_impl(ck::index_t do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // re-init C to zero before profiling next kernel
             e_device_buf.SetZero();
 
@@ -376,6 +383,11 @@ int profile_contraction_impl(ck::index_t do_verification,
               << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
               << best_op_name << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "contraction_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
index 5ea1a78094..8f7adebdd4 100644
--- a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
@@ -58,7 +58,8 @@ bool profile_conv_bwd_data_impl(int do_verification,
                                 int init_method,
                                 bool do_log,
                                 bool time_kernel,
-                                const ck::utils::conv::ConvParam& conv_param)
+                                const ck::utils::conv::ConvParam& conv_param,
+                                int instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -174,7 +175,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device Conv instances
     bool pass = true;
 
@@ -200,6 +201,12 @@ bool profile_conv_bwd_data_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // for conv bwd data, some input tensor element are zero, but not written by kernel,
             // need to set zero
             in_device_buf.SetZero();
@@ -263,7 +270,11 @@ bool profile_conv_bwd_data_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "conv_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_conv_fwd_impl.hpp b/profiler/include/profiler/profile_conv_fwd_impl.hpp
index 37366821c4..200409fe61 100644
--- a/profiler/include/profiler/profile_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_impl.hpp
@@ -36,7 +36,8 @@ bool profile_conv_fwd_impl(int do_verification,
                            int init_method,
                            bool do_log,
                           bool time_kernel,
-                           const ck::utils::conv::ConvParam& conv_param)
+                           const ck::utils::conv::ConvParam& conv_param,
+                           int instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -156,7 +157,7 @@ bool profile_conv_fwd_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     bool pass = true;
 
@@ -182,6 +183,12 @@ bool profile_conv_fwd_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // re-init output to zero before profiling next kernel
             out_device_buf.SetZero();
 
@@ -236,7 +243,11 @@ bool profile_conv_fwd_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "conv_fwd_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
index aafb7b260d..171ae1662b 100644
--- a/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
+++ b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
@@ -122,7 +122,8 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
                                         int init_method,
                                         bool do_log,
                                         bool time_kernel,
-                                        const ck::utils::conv::ConvParam& conv_param)
+                                        const ck::utils::conv::ConvParam& conv_param,
+                                        index_t instance_index = -1)
 {
     const ck::index_t NDoHoWo =
         conv_param.N_ *
@@ -226,7 +227,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
     // profile device op instances
     bool pass                   = true;
     bool is_supporting_instance = false;
-
+    index_t num_kernel          = 0;
     for(auto& op_ptr : op_ptrs)
     {
         auto argument_ptr = op_ptr->MakeArgumentPointer(
@@ -247,6 +248,12 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             is_supporting_instance = true;
             // re-init output to zero before profiling next kernel
             out_device_buf.SetZero();
@@ -291,6 +298,11 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\nGB/s: " << best_gb_per_sec << std::endl;
+    if(instance_index != -1)
+    {
+        std::cout << "conv_tensor_rearrange_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return is_supporting_instance && pass;
 }
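One subtlety of the shared pattern worth spelling out: num_kernel keeps advancing through every supported instance even while skipping, so the denominator in the trailing "(index/num_kernel): Passed" line reports how many supported instances were scanned, not how many ran. A small runnable demonstration under assumed values:

#include <iostream>

// Demonstrates the counting semantics of the skip filter used above.
int main()
{
    const int instance_index = 2; // 0-based choice, as in the profiler changes
    int num_kernel           = 0;
    for(int i = 0; i < 5; ++i) // pretend all 5 instances are supported
    {
        ++num_kernel;
        if(instance_index != -1 && instance_index + 1 != num_kernel)
            continue;
        std::cout << "running instance " << i << '\n'; // only i == 2 runs
    }
    // prints "demo_instance (2/5): Passed" -- denominator is the full count
    std::cout << "demo_instance (" << instance_index << "/" << num_kernel << "): Passed\n";
}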
diff --git a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
index 220076465d..ca08f48bcf 100644
--- a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
@@ -49,7 +49,8 @@ bool profile_elementwise_layernorm_impl(int do_verification,
                                         int init_method,
                                         bool do_log,
                                         bool time_kernel,
-                                        std::vector<index_t> length)
+                                        std::vector<index_t> length,
+                                        index_t instance_index = -1)
 {
     using Add         = ck::tensor_operation::element_wise::Add;
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -199,6 +200,11 @@ bool profile_elementwise_layernorm_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -270,6 +276,11 @@ bool profile_elementwise_layernorm_impl(int do_verification,
         return false;
     }
 
+    if(instance_index != -1)
+    {
+        std::cout << "elementwise_layernorm_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
index 470cc86d1b..74a1b60fe3 100644
--- a/profiler/include/profiler/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
@@ -70,7 +70,8 @@ bool profile_gemm_reduce_impl(int do_verification,
                               int K,
                               int StrideA,
                               int StrideB,
-                              int StrideC)
+                              int StrideC,
+                              int instance_index = -1)
 {
     bool pass = true;
 
@@ -249,7 +250,7 @@ bool profile_gemm_reduce_impl(int do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device GEMM instances
     for(auto& gemm_ptr : gemm_ptrs)
     {
@@ -275,6 +276,12 @@ bool profile_gemm_reduce_impl(int do_verification,
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // init DO, D1 to 0
             reduce0_device_buf.SetZero();
             reduce1_device_buf.SetZero();
@@ -345,7 +352,11 @@ bool profile_gemm_reduce_impl(int do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "gemm_reduce_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
index 8032730199..744db27675 100644
--- a/profiler/include/profiler/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
@@ -44,7 +44,8 @@ bool profile_gemm_splitk_impl(int do_verification,
                               int StrideC,
                               int KBatch,
                               int n_warmup,
-                              int n_iter)
+                              int n_iter,
+                              int instance_index = -1)
 {
     bool pass = true;
 
@@ -141,6 +142,7 @@ bool profile_gemm_splitk_impl(int do_verification,
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
     float best_kbatch     = 0;
+    int num_kernel        = 0;
 
     // profile device GEMM instances
     for(auto& op_ptr : op_ptrs)
@@ -175,7 +177,12 @@ bool profile_gemm_splitk_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // re-init C to zero before profiling next kernel
             c_device_buf.SetZero();
 
@@ -294,7 +301,11 @@ bool profile_gemm_splitk_impl(int do_verification,
              << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch
              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
              << " GB/s, " << best_op_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "gemm_splitk_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
index 29b2fece6b..a7c6717f58 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
@@ -35,7 +35,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
                                         bool do_log,
                                         bool time_kernel,
                                         const ck::utils::conv::ConvParam& conv_param,
-                                        ck::index_t split_k = 1)
+                                        ck::index_t split_k    = 1,
+                                        index_t instance_index = -1)
 {
     using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -123,9 +124,9 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
     ck::index_t best_split_k = 1;
 
     // profile device op instances
-    bool pass = true;
-
-    auto run_impl = [&](auto& op_ptr, auto& argument_ptr, const index_t& split_k_for_run) {
+    bool pass          = true;
+    index_t num_kernel = 0;
+    auto run_impl      = [&](auto& op_ptr, auto& argument_ptr, const index_t& split_k_for_run) {
        // workspace_sz will be equal to 0 for other layout than NGCHW
        const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
        DeviceMem workspace_dev(workspace_sz);
@@ -133,6 +134,12 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
 
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+           num_kernel++;
+           if((instance_index != -1) && (instance_index + 1 != num_kernel))
+           {
+               // skip test if instance_index is specified
+               return;
+           }
            std::string op_name = op_ptr->GetTypeString();
 
            auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -165,8 +172,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
                in_device_buf.FromDevice(in_device.mData.data());
 
                using ComputeType = std::conditional_t<sizeof(OutDataType) < sizeof(WeiDataType),
-                                                      OutDataType,
-                                                      WeiDataType>;
+                                                       OutDataType,
+                                                       WeiDataType>;
                using AccDataType =
                    std::conditional_t<std::is_same_v<ComputeType, int8_t>, int32_t, float>;
                const index_t num_accums = conv_param.K_;
@@ -297,6 +304,11 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
              << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
              << "\nGB/s: " << best_gb_per_sec << ", SplitK " << best_split_k << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
index 479fed78e7..6654275fd0 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -41,7 +41,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
                                           bool do_log,
                                           bool time_kernel,
                                           const ck::utils::conv::ConvParam& conv_param,
-                                          const std::string& split_k)
+                                          const std::string& split_k,
+                                          index_t instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -187,6 +188,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
         }
     }
 
+    index_t num_kernel = 0;
     for(auto& op_ptr : op_ptrs)
     {
         for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
@@ -226,6 +228,12 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
 
             if(op_ptr->IsSupportedArgument(argument_ptr.get()))
             {
+                num_kernel++;
+                if((instance_index != -1) && (instance_index + 1 != num_kernel))
+                {
+                    // skip test if instance_index is specified
+                    continue;
+                }
 
                 std::string op_name = op_ptr->GetTypeString();
 
@@ -326,6 +334,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
              << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
              << "\nGB/s: " << best_gb_per_sec << ", SplitK " << best_split_k << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_bwd_weight_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return all_pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
index 91ac2a0ab6..2f7f3ae4d8 100644
--- a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
@@ -126,7 +126,8 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
                                               int init_method,
                                               bool do_log,
                                               bool time_kernel,
-                                              const ck::utils::conv::ConvParam& conv_param)
+                                              const ck::utils::conv::ConvParam& conv_param,
+                                              int instance_index = -1)
 {
     const float floor = 0.f;
     const float ceil  = 2048.f;
@@ -295,6 +296,7 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
+    int num_kernel        = 0;
 
     // profile device op instances
     bool pass = true;
@@ -307,6 +309,13 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
 
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+           ++num_kernel;
+           if((instance_index != -1) && (instance_index + 1 != num_kernel))
+           {
+               // skip test if instance_index is specified
+               std::cout << op_ptr->GetTypeString() << " skipped" << std::endl;
+               return;
+           }
            // re-init output to zero before profiling next kernel
            out_device_buf.SetZero();
 
@@ -420,7 +429,11 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_fwd_bias_bnorm_clamp_instance (" << instance_index << "/"
+                  << num_kernel << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp
index 188d7aa0b0..2dbadd8eb1 100644
--- a/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp
@@ -64,7 +64,8 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
                                               int init_method,
                                               bool do_log,
                                               bool time_kernel,
-                                              const ck::utils::conv::ConvParam& conv_param)
+                                              const ck::utils::conv::ConvParam& conv_param,
+                                              int instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -194,7 +195,7 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     bool pass = true;
 
@@ -206,6 +207,13 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
 
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+           ++num_kernel;
+           if((instance_index != -1) && (instance_index + 1 != num_kernel))
+           {
+               // skip test if instance_index is specified
+               std::cout << op_ptr->GetTypeString() << " skipped" << std::endl;
+               return;
+           }
            // re-init output to zero before profiling next kernel
            out_device_buf.SetZero();
 
@@ -317,7 +325,11 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_fwd_bias_clamp_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
index 2dcee4c1fc..d490cf4167 100644
--- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
@@ -42,7 +42,8 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
                                    bool do_log,
                                    bool time_kernel,
                                    const ck::utils::conv::ConvParam& conv_param,
-                                   const OutElementOp out_element_op = OutElementOp{})
+                                   const OutElementOp out_element_op = OutElementOp{},
+                                   index_t instance_index            = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -144,7 +145,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    index_t num_kernel    = 0;
     // profile device op instances
     bool pass = true;
 
@@ -156,6 +157,13 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
 
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+           num_kernel++;
+           if((instance_index != -1) && (instance_index + 1 != num_kernel))
+           {
+               // skip test if instance_index is specified
+               return;
+           }
+
            std::string op_name = op_ptr->GetTypeString();
 
            auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -253,7 +261,11 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_fwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
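Note the control-flow split among the impls above: loop-based profilers skip an instance with continue, while grouped_conv_bwd_data and grouped_conv_fwd route each instance through a run_impl lambda, where continue is unavailable and an early return skips just that instance. A minimal sketch of that shape, with hypothetical names throughout:

#include <functional>
#include <vector>

// Sketch: when per-instance work lives in a lambda, an early "return" from the
// lambda plays the role that "continue" plays in the inline-loop variants.
void profile_all(const std::vector<std::function<bool()>>& instances, int instance_index)
{
    int num_kernel = 0;
    auto run_impl  = [&](const std::function<bool()>& inst) {
        ++num_kernel;
        if(instance_index != -1 && instance_index + 1 != num_kernel)
            return; // skips this instance only
        inst();
    };
    for(const auto& inst : instances)
        run_impl(inst);
}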
diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
index eef5e02911..8314b9053f 100644
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -44,7 +44,8 @@ bool profile_grouped_gemm_impl(int do_verification,
                               const std::vector<int>& StrideCs,
                               const std::vector<int>& kbatches = {},
                               int n_warmup                     = 1,
-                               int n_iter                       = 10)
+                               int n_iter                       = 10,
+                               int instance_index               = -1)
 {
     bool pass = true;
     // TODO: Fixme - we do not pass compute data type here but need it
@@ -195,8 +196,8 @@ bool profile_grouped_gemm_impl(int do_verification,
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
     float best_kbatch     = 0;
-
-    auto p_ds = std::vector<...>{};
+    int num_kernel = 0;
+    auto p_ds      = std::vector<...>{};
 
     if(do_verification)
     {
@@ -279,6 +280,13 @@ bool profile_grouped_gemm_impl(int do_verification,
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+
             for(std::size_t i = 0; i < gemm_descs.size(); i++)
                 c_device_buf[i]->SetZero();
 
@@ -371,7 +379,11 @@ bool profile_grouped_gemm_impl(int do_verification,
                  << best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch
                  << std::endl;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_gemm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp b/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp
index 55ea08e0db..c1647815ad 100644
--- a/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp
@@ -26,7 +26,8 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
                                      bool time_kernel,
-                                     std::vector<index_t> length)
+                                     std::vector<index_t> length,
+                                     index_t instance_index = -1)
 {
     // we don't need DGamma and DBeta here, just for reference class
     using DGammaDataType = DXDataType;
@@ -162,6 +163,11 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -242,7 +248,11 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "groupnorm_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp b/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp
index d0a5032bff..60982d18d5 100644
--- a/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp
@@ -29,7 +29,8 @@ bool profile_groupnorm_impl(int do_verification,
                             int init_method,
                             bool do_log,
                             bool time_kernel,
-                            std::vector<index_t> length)
+                            std::vector<index_t> length,
+                            index_t instance_index = -1)
 {
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
@@ -178,6 +179,11 @@ bool profile_groupnorm_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -267,6 +273,12 @@ bool profile_groupnorm_impl(int do_verification,
         return false;
     }
 
+    if(instance_index != -1)
+    {
+        std::cout << "groupnorm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
+
     return true;
 }
diff --git a/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp b/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp
index e88a06122d..7704085048 100644
--- a/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp
@@ -27,7 +27,8 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
                                      bool time_kernel,
-                                     std::vector<index_t> length)
+                                     std::vector<index_t> length,
+                                     index_t instance_index = -1)
 {
     // we don't need DGamma and DBeta here, just for reference class
     using DGammaDataType = DXDataType;
@@ -167,6 +168,11 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -247,7 +253,11 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "layernorm_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp b/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
index 10fa9c86d5..e36b20e1b5 100644
--- a/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
@@ -27,7 +27,8 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
                                            int init_method,
                                            bool do_log,
                                            bool time_kernel,
-                                           std::vector<index_t> length)
+                                           std::vector<index_t> length,
+                                           index_t instance_index = -1)
 {
     // we don't need GammaDataType and DXDataType here, just for reference class
     using GammaDataType = DYDataType;
@@ -178,6 +179,11 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -255,7 +261,11 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "layernorm_bwd_gamma_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_layernorm_fwd_impl.hpp b/profiler/include/profiler/profile_layernorm_fwd_impl.hpp
index 66272b6eff..51dcbb1275 100644
--- a/profiler/include/profiler/profile_layernorm_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_fwd_impl.hpp
@@ -28,7 +28,8 @@ bool profile_layernorm_impl(int do_verification,
                             int init_method,
                             bool do_log,
                             bool time_kernel,
-                            std::vector<index_t> length)
+                            std::vector<index_t> length,
+                            index_t instance_index = -1)
 {
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
@@ -188,6 +189,11 @@ bool profile_layernorm_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -286,6 +292,12 @@ bool profile_layernorm_impl(int do_verification,
         return false;
     }
 
+    if(instance_index != -1)
+    {
+        std::cout << "layernorm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
+
     return true;
 }
diff --git a/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp b/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp
index 6e3de3a26a..a8efee3ef0 100644
--- a/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp
@@ -34,7 +34,8 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     // AtomicAdd only support f32 for now. ComputeDataType must be float32
     using ComputeDataType = float;
@@ -199,6 +200,11 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
         {
             ++num_kernel;
             instance_found = true;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -289,7 +295,11 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "max_pool2d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass && instance_found;
 }
diff --git a/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp b/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
index 407337f827..cf6050969f 100644
--- a/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
@@ -34,7 +34,8 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     // AtomicAdd only support f32 for now. ComputeDataType must be float32
     using ComputeDataType = float;
@@ -193,6 +194,11 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -281,7 +287,11 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "max_pool3d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
index 88162b9417..962be4448c 100644
--- a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
@@ -35,7 +35,8 @@ bool profile_pool2d_fwd_impl(int do_verification,
                              std::vector<index_t> window_strides,
                             std::vector<index_t> window_dilations,
                             std::vector<index_t> input_left_pads,
-                             std::vector<index_t> input_right_pads)
+                             std::vector<index_t> input_right_pads,
+                             index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 4;
     constexpr index_t WindowRank = 2;
@@ -171,6 +172,11 @@ bool profile_pool2d_fwd_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -268,7 +274,11 @@ bool profile_pool2d_fwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "max_pool2d_fwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
index 412946d558..e1d0c1573d 100644
--- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
@@ -46,7 +46,9 @@ template
-bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& kernel_params)
+bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params,
+                             PoolFwdKernelParams& kernel_params,
+                             index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 5;
     constexpr index_t WindowRank = 3;
@@ -199,6 +201,11 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // instance_index is specified: skip every instance except the selected one
+                continue;
+            }
         }
         else
         {
@@ -328,7 +335,11 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "pool3d_fwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_reduce_impl.hpp b/profiler/include/profiler/profile_reduce_impl.hpp
index b54aa65aef..14a93af69d 100644
--- a/profiler/include/profiler/profile_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_reduce_impl.hpp
@@ -144,7 +144,8 @@ bool profile_reduce_impl_impl(bool do_verification,
                               const std::vector<index_t>& inLengths,
                               const std::array<int, NumReduceDim>& reduceDims,
                               float alpha,
-                              float beta)
+                              float beta,
+                              index_t instance_index = -1)
 {
     using namespace ck::tensor_operation::device;
     using namespace ck::tensor_operation::device::instance;
@@ -373,7 +374,14 @@ bool profile_reduce_impl_impl(bool do_verification,
         if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
             continue;
         else
+        {
             num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // instance_index is specified: skip every instance except the selected one
+                continue;
+            }
+        }
 
         std::string reduce_name = reduce_ptr->GetTypeString();
@@ -452,7 +460,11 @@ bool profile_reduce_impl_impl(bool do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     };
-
+    if(instance_index != -1)
+    {
+        std::cout << "reduce_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 };
@@ -467,7 +479,8 @@ bool profile_reduce_impl(bool do_verification,
                          bool PropagateNan,
                          bool UseIndex,
                          float alpha,
-                         float beta)
+                         float beta,
+                         index_t instance_index = -1)
 {
     bool matched = false;
     bool pass    = true;
@@ -505,7 +518,8 @@ bool profile_reduce_impl(bool do_verification,
                                              inLengths,
                                              arrReduceDims,
                                              alpha,
-                                             beta);
+                                             beta,
+                                             instance_index);
 
         matched = true;
     });
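One detail worth noting in the reduce changes above: the outer `profile_reduce_impl` wrapper re-declares the `= -1` default rather than relying on the one in `profile_reduce_impl_impl`, because a default argument belongs to a declaration and does not propagate through a call chain. The same mechanic in miniature (generic C++, not CK code):

    #include <iostream>

    static bool impl(int length, int instance_index = -1)
    {
        std::cout << "length=" << length << ", instance_index=" << instance_index << '\n';
        return true;
    }

    // Without re-stating "= -1" here, callers of dispatch() could not omit the
    // index: the default on impl() is invisible through the extra call layer.
    static bool dispatch(int length, int instance_index = -1)
    {
        return impl(length, instance_index); // forwarded unchanged
    }

    int main() { return dispatch(64) ? 0 : 1; }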
diff --git a/profiler/include/profiler/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp
index 83913d8398..d7a790803a 100644
--- a/profiler/include/profiler/profile_softmax_impl.hpp
+++ b/profiler/include/profiler/profile_softmax_impl.hpp
@@ -53,7 +53,8 @@ bool profile_softmax_impl(int do_verification,
                           std::vector<index_t> in_strides,
                           std::vector<index_t> reduce_dims,
                           double alpha,
-                          double beta)
+                          double beta,
+                          index_t instance_index = -1)
 {
     if(Rank != in_length.size())
     {
@@ -124,7 +125,7 @@ bool profile_softmax_impl(int do_verification,
     float best_avg_time   = std::numeric_limits<float>::max();
     float best_gb_per_sec = 0;
     std::vector<bool> instance_pass;
-
+    index_t num_kernel = 0;
     for(auto& inst_ptr : instances)
     {
         auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
@@ -146,6 +147,15 @@ bool profile_softmax_impl(int do_verification,
             instance_pass.push_back(true);
             continue;
         }
+        else
+        {
+            num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // instance_index is specified: skip every instance except the selected one
+                continue;
+            }
+        }
 
         out_dev.ToDevice(prior_out.data());
         auto invoker_ptr = inst_ptr->MakeInvokerPointer();
@@ -216,6 +226,11 @@ bool profile_softmax_impl(int do_verification,
         std::cout << "alpha = " << alpha << ", "
                   << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
                   << " GB/s, " << best_instance_name << std::endl;
     }
+    if(instance_index != -1)
+    {
+        std::cout << "softmax_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return std::all_of(
         std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; });
 }
diff --git a/test/batched_gemm/test_batched_gemm_wmma.cpp b/test/batched_gemm/test_batched_gemm_wmma.cpp
index 18f9db8c39..fc190bed85 100644
--- a/test/batched_gemm/test_batched_gemm_wmma.cpp
+++ b/test/batched_gemm/test_batched_gemm_wmma.cpp
@@ -12,7 +12,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
-
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
 struct GemmParams
 {
     ck::index_t M;
@@ -37,96 +38,153 @@ class TestBatchedGemm : public ::testing::Test
         using namespace ck::tensor_operation::device;
 
         bool pass = true;
-        for(auto& param : params)
+        for(size_t i = 0; i < params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param           = params[i];
             const auto M          = param.M;
             const auto N          = param.N;
             const auto K          = param.K;
             const auto BatchCount = param.BatchCount;
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-                    true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                K,
+                N,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-                    true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                K,
+                K,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-                    true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                M,
+                N,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-                    true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                M,
+                K,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -191,3 +249,20 @@ TEST_F(TestBatchedGemm, fp16)
 //     this->template Run();
 // }
 // #endif
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
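The test executables pair `instance_index` with a `param_mask`, where bit i of the mask enables `params[i]`. A self-contained illustration of the gating (the values are hypothetical):

    // param_mask gates parameter sets by bit position: bit i keeps params[i].
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> params = {10, 20, 30, 40};
        long param_mask         = 0x5; // binary 0101: keep params[0] and params[2]

        for(std::size_t i = 0; i < params.size(); i++)
        {
            if((param_mask & (1 << i)) == 0)
                continue; // bit i is clear: skip this parameter set
            std::printf("running params[%zu] = %d\n", i, params[i]);
        }
        return 0;
    }

A typical invocation (binary name assumed) would then be `./test_batched_gemm_wmma 0x5 0`: run only the first and third parameter sets, and within each, only the first supported instance.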
diff --git a/test/batched_gemm/test_batched_gemm_xdl.cpp b/test/batched_gemm/test_batched_gemm_xdl.cpp
index f9bb626ce5..3b7c392004 100644
--- a/test/batched_gemm/test_batched_gemm_xdl.cpp
+++ b/test/batched_gemm/test_batched_gemm_xdl.cpp
@@ -13,6 +13,9 @@
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 struct GemmParams
 {
     ck::index_t M;
@@ -37,96 +40,153 @@ class TestBatchedGemm : public ::testing::Test
         using namespace ck::tensor_operation::device;
 
         bool pass = true;
-        for(auto& param : params)
+        for(size_t i = 0; i < params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param           = params[i];
             const auto M          = param.M;
             const auto N          = param.N;
             const auto K          = param.K;
             const auto BatchCount = param.BatchCount;
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-                    true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                K,
+                N,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-                    true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                K,
+                K,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-                    true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                M,
+                N,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-                    true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                M,
+                K,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -183,3 +243,20 @@ TYPED_TEST(TestBatchedGemm, fp32)
     this->template Run();
 }
 #endif
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
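Both masks here are parsed with `strtol(argv[1], nullptr, 0)`; base 0 makes the radix self-describing, which is why `0xffff`-style arguments work on the command line. For instance:

    // strtol with base 0 auto-detects decimal, hex (0x...), and octal (0...).
    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        const char* samples[] = {"255", "0xff", "0377"};
        for(const char* s : samples)
            std::printf("%-5s -> %ld\n", s, std::strtol(s, nullptr, 0)); // all print 255
        return 0;
    }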
diff --git a/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp b/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp
index eba461a420..5e250bc356 100644
--- a/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp
+++ b/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp
@@ -7,6 +7,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
 
+static ck::index_t instance_index = -1;
+
 namespace
 {
 using F16 = ck::half_t;
@@ -70,7 +72,8 @@ class TestBatchedGemmMultiD : public ::testing::Test
                 M * K,
                 K * N,
                 M * N,
-                BatchCount);
+                BatchCount,
+                instance_index);
 
         EXPECT_TRUE(pass);
     }
 };
@@ -88,3 +91,18 @@ TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run(); }
 #ifdef CK_ENABLE_INT8
 TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run(); }
 #endif
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 2)
+    {
+        instance_index = atoi(argv[1]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1: instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp
index cb46a995c6..1ab29f251a 100644
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp
@@ -4,6 +4,9 @@
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestBatchedGemmSoftmaxGemmFP16 : public TestBatchedGemmSoftmaxGemm<Tuple>
 {
@@ -174,3 +177,20 @@ TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, AdhocTest)
     };
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
index 2611f91e66..8074d8a311 100644
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
@@ -9,6 +9,9 @@
 #include "profiler/profile_batched_gemm_softmax_gemm_impl.hpp"
 
 using ck::tensor_operation::device::GemmSpecialization;
+extern ck::index_t param_mask;
+extern ck::index_t instance_index;
+
 template <ck::index_t N>
 using I = ck::Number<N>;
 
@@ -57,15 +60,38 @@ struct TestBatchedGemmSoftmaxGemm : public ::testing::Test
                                                               B1Layout,
                                                               CLayout,
                                                               MaskingType::value>(
-            verify_, 1, false, bench_, M, N, K, O, BatchCount);
+            verify_,
+            1,
+            false,
+            bench_,
+            M,
+            N,
+            K,
+            O,
+            BatchCount,
+            -1, // StrideA
+            -1, // StrideB0
+            -1, // StrideB1
+            -1, // StrideC
+            -1, // BatchStrideA
+            -1, // BatchStrideB0
+            -1, // BatchStrideB1
+            -1, // BatchStrideC
+            -1, // alpha
+            instance_index);
 
         EXPECT_TRUE(pass);
     }
 
     void Run()
     {
-        for(auto lengths : this->lengths_)
+        for(size_t i = 0; i < this->lengths_.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& lengths = this->lengths_[i];
             int M = lengths[0];
             int N = lengths[1];
             int K = lengths[2];
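Because `instance_index` sits after the stride, batch-stride, and alpha defaults of `profile_batched_gemm_softmax_gemm_impl`, the call in `RunSingle` now has to spell all of them out as `-1` sentinels just to reach the last parameter. The same mechanic in miniature (the names here are hypothetical):

    // C++ default arguments are positional: to pass the last one explicitly,
    // every earlier default must also be written out, here as -1 sentinels.
    static int run(int m, int stride = -1, int alpha = -1, int instance_index = -1)
    {
        if(stride == -1) stride = 1; // -1 means "derive the real default"
        if(alpha == -1) alpha = 1;
        (void)instance_index; // would select a single instance, as in the profiler
        return m * stride * alpha;
    }

    int main()
    {
        return run(3, /*stride*/ -1, /*alpha*/ -1, /*instance_index*/ 0) == 3 ? 0 : 1;
    }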
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp
index ef88ce6d81..9ce603c575 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp
@@ -4,6 +4,8 @@
 #include "gtest/gtest.h"
 #include "test_batched_gemm_bias_softmax_gemm_permute_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16
     : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -180,3 +182,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest)
     };
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp
index b38b10d195..40ce64837d 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp
@@ -4,6 +4,8 @@
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_permute_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16
     : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -180,3 +182,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, AdhocTest)
     };
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
index eda74819e9..e37cadd0c5 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
@@ -10,7 +10,8 @@
 #include "profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp"
 #include
-
+extern ck::index_t param_mask;
+extern ck::index_t instance_index;
 using ck::tensor_operation::device::GemmSpecialization;
 using ck::tensor_operation::device::MaskingSpecialization;
 using ck::tensor_operation::device::TensorSpecialization;
@@ -66,21 +67,26 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test
             Acc0BiasDataType,
             Acc1BiasDataType,
             MaskingType::value>(
-            verify_, 2, false, bench_, M, N, K, O, G0, G1);
+            verify_, 2, false, bench_, M, N, K, O, G0, G1, -1, instance_index);
 
         EXPECT_TRUE(pass);
     }
 
     void Run()
     {
-        for(auto lengths : this->lengths_)
+        for(size_t i = 0; i < this->lengths_.size(); i++)
         {
-            int M  = lengths[0];
-            int N  = lengths[1];
-            int K  = lengths[2];
-            int O  = lengths[3];
-            int G0 = lengths[4];
-            int G1 = lengths[5];
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& lengths = this->lengths_[i];
+            int M         = lengths[0];
+            int N         = lengths[1];
+            int K         = lengths[2];
+            int O         = lengths[3];
+            int G0        = lengths[4];
+            int G1        = lengths[5];
 
             this->RunSingle(M, N, K, O, G0, G1);
         }
"test_batched_gemm_softmax_gemm_permute_util.hpp" #include "test_batched_gemm_device_utils.hpp" +ck::index_t param_mask = 0xffff; +ck::index_t instance_index = -1; template class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16 : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute @@ -228,3 +230,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest) }; this->Run(); } + +int main(int argc, char** argv) +{ + testing::InitGoogleTest(&argc, argv); + if(argc == 1) {} + else if(argc == 3) + { + param_mask = strtol(argv[1], nullptr, 0); + instance_index = atoi(argv[2]); + } + else + { + std::cout << "Usage of " << argv[0] << std::endl; + std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl; + } + return RUN_ALL_TESTS(); +} diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp index 3a86736f44..61baa50cd7 100644 --- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp +++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp @@ -5,6 +5,9 @@ #include "test_batched_gemm_softmax_gemm_permute_util.hpp" #include "test_batched_gemm_device_utils.hpp" +ck::index_t param_mask = 0xffff; +ck::index_t instance_index = -1; + template class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16 : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute @@ -191,3 +194,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, AdhocTest) }; this->Run(); } + +int main(int argc, char** argv) +{ + testing::InitGoogleTest(&argc, argv); + if(argc == 1) {} + else if(argc == 3) + { + param_mask = strtol(argv[1], nullptr, 0); + instance_index = atoi(argv[2]); + } + else + { + std::cout << "Usage of " << argv[0] << std::endl; + std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl; + } + return RUN_ALL_TESTS(); +} diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp index d9177ff0f2..13d2e0f0a2 100644 --- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp +++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp @@ -9,6 +9,8 @@ #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" #include "profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp" +extern ck::index_t param_mask; +extern ck::index_t instance_index; using ck::tensor_operation::device::GemmSpecialization; using ck::tensor_operation::device::MaskingSpecialization; using ck::tensor_operation::device::TensorSpecialization; @@ -64,21 +66,26 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test ck::Tuple<>, ck::Tuple<>, MaskingType::value>( - verify_, 2, false, bench_, M, N, K, O, G0, G1); + verify_, 2, false, bench_, M, N, K, O, G0, G1, -1, instance_index); EXPECT_TRUE(pass); } void Run() { - for(auto lengths : this->lengths_) + for(size_t i = 0; i < this->lengths_.size(); i++) { - int M = lengths[0]; - int N = lengths[1]; - int K = lengths[2]; - int O = lengths[3]; - int G0 = lengths[4]; - int G1 = lengths[5]; + if((param_mask & (1 << i)) == 0) + { + continue; + } + auto& lengths = this->lengths_[i]; + int M = lengths[0]; + int N = lengths[1]; + int K = lengths[2]; + int O = lengths[3]; + 
diff --git a/test/batchnorm/batchnorm_bwd_rank_4.cpp b/test/batchnorm/batchnorm_bwd_rank_4.cpp
index cc514261e6..66908360d1 100644
--- a/test/batchnorm/batchnorm_bwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_bwd_rank_4.cpp
@@ -15,6 +15,9 @@ using F32 = float;
 using BF16 = ck::bhalf_t;
 using F64  = double;
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestBatchNormBwdRank4 : public ::testing::Test
 {
@@ -37,33 +40,48 @@ class TestBatchNormBwdRank4 : public ::testing::Test
     template <ck::index_t NumReduceDim>
     void Run()
     {
-        for(auto& inOutLengths : list_of_lengths)
+        for(size_t i = 0; i < list_of_lengths.size(); i++)
         {
-            bool pass = true;
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& inOutLengths = list_of_lengths[i];
+            bool pass          = true;
 
             EXPECT_FALSE(reduceDims.size() != NumReduceDim);
 
-            pass = pass && ck::profiler::profile_batchnorm_backward_impl(
-                true, 3, false, false, inOutLengths, reduceDims, true, epsilon);
+            pass =
+                pass &&
+                ck::profiler::profile_batchnorm_backward_impl(
+                    true, 3, false, false, inOutLengths, reduceDims, true, epsilon, instance_index);
 
-            pass = pass && ck::profiler::profile_batchnorm_backward_impl(
-                true, 3, false, false, inOutLengths, reduceDims, false, epsilon);
+            pass =
+                pass && ck::profiler::profile_batchnorm_backward_impl(true,
+                                                                      3,
+                                                                      false,
+                                                                      false,
+                                                                      inOutLengths,
+                                                                      reduceDims,
+                                                                      false,
+                                                                      epsilon,
+                                                                      instance_index);
 
             EXPECT_TRUE(pass);
         }
@@ -103,3 +121,19 @@ TYPED_TEST(TestBatchNormBwdRank4, nchw)
     this->reduceDims = {0, 2, 3};
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batchnorm/batchnorm_fwd_rank_4.cpp b/test/batchnorm/batchnorm_fwd_rank_4.cpp
index 6bf635f0cd..8d81a3892c 100644
--- a/test/batchnorm/batchnorm_fwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_fwd_rank_4.cpp
@@ -16,6 +16,9 @@ using BF16 = ck::bhalf_t;
 using I8  = int8_t;
 using F64 = double;
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestBatchNormFwdRank4 : public ::testing::Test
 {
@@ -38,9 +41,14 @@ class TestBatchNormFwdRank4 : public ::testing::Test
     template <ck::index_t NumReduceDim>
     void Run()
     {
-        for(auto& inOutLengths : list_of_lengths)
+        for(size_t i = 0; i < list_of_lengths.size(); i++)
         {
-            bool pass = true;
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& inOutLengths = list_of_lengths[i];
+            bool pass          = true;
 
             EXPECT_FALSE(reduceDims.size() != NumReduceDim);
 
@@ -61,7 +69,8 @@ class TestBatchNormFwdRank4 : public ::testing::Test
                                     true,
                                     true,
                                     epsilon,
-                                    averageFactor);
+                                    averageFactor,
+                                    instance_index);
 
             pass = pass && ck::profiler::profile_batchnorm_forward_impl
[...]
     this->reduceDims = {0, 2, 3};
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
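One caveat that applies to every `(param_mask & (1 << i))` loop added in this patch: `1` is a signed int, so the shift is only well-defined for i up to 30, and the default mask of 0xffff only enables the first 16 parameter sets anyway. If a test ever grows past that, a wider check along these lines would be needed; this is a sketch, not part of the patch:

    #include <cstddef>
    #include <cstdint>

    // Safe bit test for parameter selection: unsigned 64-bit shift, with a
    // range guard so indices past the mask width are treated as enabled.
    inline bool param_selected(std::uint64_t mask, std::size_t i)
    {
        return i >= 64 || (mask & (std::uint64_t{1} << i)) != 0;
    }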
diff --git a/test/batchnorm/batchnorm_infer_rank_4.cpp b/test/batchnorm/batchnorm_infer_rank_4.cpp
index 0165192acf..41c9cdb94e 100644
--- a/test/batchnorm/batchnorm_infer_rank_4.cpp
+++ b/test/batchnorm/batchnorm_infer_rank_4.cpp
@@ -10,6 +10,9 @@
 #include "profiler/profile_batchnorm_infer_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 using F16  = ck::half_t;
 using F32  = float;
 using BF16 = ck::bhalf_t;
@@ -36,31 +39,38 @@ class TestBatchNormInferRank4 : public ::testing::Test
     template <ck::index_t NumReduceDim>
     void Run()
     {
-        for(auto& inOutLengths : list_of_lengths)
+        for(size_t i = 0; i < list_of_lengths.size(); i++)
         {
-            bool pass = true;
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& inOutLengths = list_of_lengths[i];
+            bool pass          = true;
 
             EXPECT_FALSE(reduceDims.size() != NumReduceDim);
 
-            pass = pass && ck::profiler::profile_batchnorm_infer_impl(
-                true, 3, false, false, inOutLengths, reduceDims, epsilon);
+            pass = pass &&
+                   ck::profiler::profile_batchnorm_infer_impl(
+                       true, 3, false, false, inOutLengths, reduceDims, epsilon, instance_index);
 
-            pass = pass && ck::profiler::profile_batchnorm_infer_impl(
-                true, 3, false, false, inOutLengths, reduceDims, epsilon);
+            pass = pass &&
+                   ck::profiler::profile_batchnorm_infer_impl(
+                       true, 3, false, false, inOutLengths, reduceDims, epsilon, instance_index);
 
             EXPECT_TRUE(pass);
         }
@@ -100,3 +110,20 @@ TYPED_TEST(TestBatchNormInferRank4, nchw)
     this->reduceDims = {0, 2, 3};
     this->template Run<3>();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/contraction/test_contraction_xdl.cpp b/test/contraction/test_contraction_xdl.cpp
index 2bfd5a6a66..3a65b57b0e 100644
--- a/test/contraction/test_contraction_xdl.cpp
+++ b/test/contraction/test_contraction_xdl.cpp
@@ -12,10 +12,11 @@
 #include "profiler/profile_contraction_impl.hpp"
 #include "profiler/profile_contraction_utils.hpp"
 
-using F16  = ck::half_t;
-using BF16 = ck::bhalf_t;
-using F32  = float;
-using F64  = double;
+static ck::index_t instance_index = -1;
+using F16                         = ck::half_t;
+using BF16                        = ck::bhalf_t;
+using F32                         = float;
+using F64                         = double;
 
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -95,7 +96,8 @@ class TestContraction : public ::testing::Test
                 StridesA,
                 StridesB,
                 StridesC,
-                StridesD);
+                StridesD,
+                instance_index);
 
             EXPECT_TRUE(pass);
         }
     }
@@ -219,3 +221,18 @@ TYPED_TEST(TestContractionScaleMixedPrecision, scale)
     this->template Run<2>({{8, 16}, {1, 1}, {8, 16}});
     this->template Run<2>({{1, 1}, {1, 1}, {1, 1}});
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 2)
+    {
+        instance_index = atoi(argv[1]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1: instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
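The contraction test takes a single optional argument, parsed with `atoi`. Worth remembering when driving these binaries from scripts: `atoi` reports no errors and simply yields 0 (i.e. instance 0, not "all") on malformed input, whereas `strtol` at least exposes how much of the string was consumed:

    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        const char* s = "abc";
        std::printf("atoi:   %d\n", std::atoi(s)); // 0, indistinguishable from "0"
        char* end = nullptr;
        long v    = std::strtol(s, &end, 10);
        std::printf("strtol: %ld (parsed %td chars)\n", v, end - s); // 0 (parsed 0 chars)
        return 0;
    }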
diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp
index 5cb8731b26..8904b58d8d 100644
--- a/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp
+++ b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp
@@ -11,6 +11,9 @@
 #include "profiler/profile_conv_tensor_rearrange_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestConvTensorRearrange : public ::testing::Test
 {
@@ -25,18 +28,24 @@ class TestConvTensorRearrange : public ::testing::Test
     {
         EXPECT_FALSE(conv_params.empty());
         bool pass = true;
-        for(auto& param : conv_params)
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
-            pass = pass && ck::profiler::profile_conv_tensor_rearrange_impl(
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
+            pass = pass && ck::profiler::profile_conv_tensor_rearrange_impl(
                 true,  // do_verification
                 1,     // init_method: integer value
                 false, // do_log
                 false, // time_kernel
-                param);
+                param,
+                instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -157,3 +166,19 @@ TYPED_TEST(TestConvTensorRearrange3d, Test3D)
     this->template Run<3, int8_t, int8_t>();
 #endif
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp b/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp
index 9d2b6cf577..5ad4f63d30 100644
--- a/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp
+++ b/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp
@@ -9,7 +9,8 @@
 #include
 #include "profiler/profile_conv_bwd_data_impl.hpp"
-
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestConvndBwdData : public ::testing::Test
 {
@@ -20,10 +21,15 @@ class TestConvndBwdData : public ::testing::Test
     template <ck::index_t NDimSpatial>
     void Run()
    {
-        for(auto& param : conv_params)
+        EXPECT_FALSE(conv_params.empty());
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
             bool pass;
-            EXPECT_FALSE(conv_params.empty());
 
             pass = ck::profiler::profile_conv_bwd_data_impl<
                 NDimSpatial,
                 ck::tuple_element_t
[...]
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/convnd_fwd/convnd_fwd_xdl.cpp b/test/convnd_fwd/convnd_fwd_xdl.cpp
index fe8798ceb8..6d507211ce 100644
--- a/test/convnd_fwd/convnd_fwd_xdl.cpp
+++ b/test/convnd_fwd/convnd_fwd_xdl.cpp
@@ -10,6 +10,8 @@
 #include "profiler/profile_conv_fwd_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestConvndFwd : public ::testing::Test
 {
@@ -20,10 +22,15 @@ class TestConvndFwd : public ::testing::Test
     template <ck::index_t NDimSpatial>
     void Run()
     {
-        for(auto& param : conv_params)
+        EXPECT_FALSE(conv_params.empty());
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
             bool pass;
-            EXPECT_FALSE(conv_params.empty());
 
             pass = ck::profiler::profile_conv_fwd_impl<
                 NDimSpatial,
                 ck::tuple_element_t
[...]
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
index d5ce77dc2b..43192ed139 100644
--- a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
+++ b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
@@ -8,6 +8,9 @@ using F16 = ck::half_t;
 using F32 = float;
 using ck::index_t;
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestElementwiseLayernorm : public ::testing::Test
 {
@@ -25,15 +28,20 @@ class TestElementwiseLayernorm : public ::testing::Test
         std::vector<std::vector<index_t>> lengths = {
             {1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}, {4096, 8192}};
 
-        for(auto length : lengths)
+        for(size_t i = 0; i < lengths.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& length = lengths[i];
             bool success = ck::profiler::profile_elementwise_layernorm_impl(
-                true, 2, false, false, length);
+                true, 2, false, false, length, instance_index);
 
             EXPECT_TRUE(success);
         }
     }
@@ -45,3 +53,19 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestElementwiseLayernorm, KernelTypes);
 
 TYPED_TEST(TestElementwiseLayernorm, Test_FP16) { this->Run(); }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm/gemm_bf16.cpp b/test/gemm/gemm_bf16.cpp
index cde5c45aea..d06735a097 100644
--- a/test/gemm/gemm_bf16.cpp
+++ b/test/gemm/gemm_bf16.cpp
@@ -31,4 +31,4 @@ using AccDataType = float;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_fp16.cpp b/test/gemm/gemm_fp16.cpp
index cad250c6fb..185412ab65 100644
--- a/test/gemm/gemm_fp16.cpp
+++ b/test/gemm/gemm_fp16.cpp
@@ -31,4 +31,4 @@ using AccDataType = float;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp
index c35aa77ea7..cf2d0bd01d 100644
--- a/test/gemm/gemm_fp32.cpp
+++ b/test/gemm/gemm_fp32.cpp
@@ -31,4 +31,4 @@ using AccDataType = float;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp
index e67c8ba4f3..7bf89d9c20 100644
--- a/test/gemm/gemm_fp64.cpp
+++ b/test/gemm/gemm_fp64.cpp
@@ -31,4 +31,4 @@ using AccDataType = double;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp
index 6ece05e306..f1a19dd61a 100644
--- a/test/gemm/gemm_int8.cpp
+++ b/test/gemm/gemm_int8.cpp
@@ -31,4 +31,4 @@ using AccDataType = int32_t;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
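The five gemm_*.cpp changes are identical one-liners that thread the command line into `run_gemm_test()`. That only compiles if the shared `run_gemm_test.inc` was updated to the two-argument signature; that hunk is not shown in this excerpt, so the stub below is an assumption about its shape, not the actual file:

    // Hypothetical stand-in for run_gemm_test.inc's entry point after this
    // patch: same name, now accepting the command line for index parsing.
    static int run_gemm_test(int argc, char* argv[])
    {
        (void)argc; // presumably parses an instance index, as elsewhere
        (void)argv;
        return 0;
    }

    int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }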
diff --git a/test/gemm/gemm_standalone_xdl_fp16.cpp b/test/gemm/gemm_standalone_xdl_fp16.cpp
index 201a49dcd3..bee2d1ec80 100644
--- a/test/gemm/gemm_standalone_xdl_fp16.cpp
+++ b/test/gemm/gemm_standalone_xdl_fp16.cpp
@@ -105,6 +105,7 @@ int main(int argc, char* argv[])
 
     bool do_verification = true;
     bool time_kernel     = true;
+    int problem_index    = -1;
 
     if(argc == 1)
     {
@@ -115,16 +116,28 @@ int main(int argc, char* argv[])
         do_verification = std::stoi(argv[1]);
         time_kernel     = std::stoi(argv[2]);
     }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+        problem_index   = std::stoi(argv[3]);
+    }
     else
     {
         std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
-                  << "arg2: time kernel (0=no, 1=yes)" << std::endl;
+                  << "arg2: time kernel (0=no, 1=yes)" << std::endl
+                  << "arg3: problem index (0-35, -1 means all)" << std::endl;
         return 0;
     }
 
     bool pass = true;
-    for(auto& p : problems)
+    for(size_t i = 0; i < problems.size(); i++)
     {
+        if(problem_index != -1 && problem_index != static_cast<int>(i))
+        {
+            continue;
+        }
+        auto& p = problems[i];
         GemmParams& problem_size          = std::get<0>(p);
         const LayoutConfig& layout_config = std::get<1>(p);
         const auto& factory               = std::get<2>(p);
diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp
index 6c46f4ee89..043eca0e83 100644
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -261,6 +261,44 @@ struct TestGemm
             return true;
         }
     }
+
+    template