[CK] Add command option instance_index and param_mask to run partial ck test (#2889)

* [CK] Add command option instance_index and param_mask to run partial ck test

Many CK tests are instance tests: they loop over every instance in the instance library, which often makes them run out of time on a simulator/emulator.
This PR adds the command options instance_index and param_mask to reduce the workload of instance tests:

instance_index: run only the single available instance with the specified index.
param_mask: filter the embedded parameter sets with the specified bitmask.
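
As a rough sketch of how param_mask behaves in the updated tests (an illustrative model only, not code from this commit; the params vector and the printout are invented for the example), bit i of the mask enables the i-th embedded parameter set, so a mask of 0x5 runs sets 0 and 2:

#include <cstdio>
#include <vector>

int main()
{
    // Four embedded parameter sets, standing in for the test's params vector.
    std::vector<int> params = {10, 20, 30, 40};
    long param_mask = 0x5; // binary 0101: keep sets 0 and 2 only

    for(std::size_t i = 0; i < params.size(); i++)
    {
        if((param_mask & (1 << i)) == 0)
        {
            continue; // same bit test the tests use to skip a parameter set
        }
        std::printf("running parameter set %zu (value %d)\n", i, params[i]);
    }
    return 0;
}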

* fix CI error

* fix clang-format

---------

Co-authored-by: illsilin_amdeng <Illia.Silin@amd.com>

[ROCm/composable_kernel commit: e78a897ec0]
linqunAMD authored on 2025-09-30 23:24:40 +08:00, committed by GitHub
parent 780456f1ce, commit 6c4ff0b062
113 changed files with 2804 additions and 704 deletions

View File

@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
const std::array<int, 2> reduceDims = {3, 4};
// const std::array<int, 3> invariantDims = {0, 1, 2};
const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
// input lengths of the second reduction, which is also the output lengths of the first
// reduction
const std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
const std::vector<size_t> outLengths = {64, 320, 80};
std::vector<size_t> outLengths = {64, 320, 80};
if(argc == 1)
{
@@ -114,11 +114,26 @@ int main(int argc, char* argv[])
init_method = 2;
time_kernel = true;
}
else if(argc == 4)
else if((argc == 4) || (argc == 9))
{
do_verify = static_cast<bool>(argv[1]);
init_method = atoi(argv[2]);
time_kernel = static_cast<bool>(atoi(argv[3]));
if(argc == 9)
{
inLengths_1[0] = atoi(argv[4]);
inLengths_1[1] = atoi(argv[5]);
inLengths_1[2] = atoi(argv[6]);
inLengths_1[3] = atoi(argv[7]);
inLengths_1[4] = atoi(argv[8]);
inLengths_2[0] = inLengths_1[0];
inLengths_2[1] = inLengths_1[1];
inLengths_2[2] = inLengths_1[2];
inLengths_2[3] = inLengths_1[3];
outLengths[0] = inLengths_1[0];
outLengths[1] = inLengths_1[1];
outLengths[2] = inLengths_1[2];
}
}
else
{

View File

@@ -50,14 +50,14 @@ template<> struct emb_kernel<ck::half_t, 8192> { using kernel_type = DeviceInsta
// clang-format on
int main()
int main(int argc, char* argv[])
{
bool time_kernel = true;
constexpr auto num_rows = 65536;
constexpr auto dims = ck::Sequence<256, 512, 768, 1024, 1536, 2048, 4096, 8192>{};
// constexpr auto dims = ck::Sequence<256, 512>{};
constexpr auto index_length = 2048;
ck::index_t num_rows = 65536;
constexpr auto dims = ck::Sequence<256, 512, 768, 1024, 1536, 2048, 4096, 8192>{};
ck::index_t index_length = 2048;
ck::index_t dim_mask = 0xffff;
constexpr AccDataType epsilon = 1e-4;
auto f_host_tensor_desc_1d = [](std::size_t len_) { return HostTensorDescriptor({len_}); };
@@ -73,121 +73,140 @@ int main()
BetaDataType,
AccDataType,
OutType>;
if(argc == 1)
{
// Use default value
}
else if(argc == 4)
{
num_rows = atoi(argv[1]);
dim_mask = strtol(argv[2], nullptr, 0);
index_length = atoi(argv[3]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1-3: num_rows dim_mask index_length" << std::endl;
}
ck::static_for<0, dims.Size(), 1>{}([&](auto I) {
std::srand(std::time(nullptr));
constexpr auto current_dim = dims.At(I);
Tensor<EmbType> emb_a(f_host_tensor_desc_2d(num_rows, current_dim));
Tensor<EmbType> emb_b(f_host_tensor_desc_2d(num_rows, current_dim));
Tensor<EmbType> emb_c(f_host_tensor_desc_2d(num_rows, current_dim));
Tensor<IndexType> index_a(f_host_tensor_desc_1d(index_length));
Tensor<IndexType> index_b(f_host_tensor_desc_1d(index_length));
Tensor<IndexType> index_c(f_host_tensor_desc_1d(index_length));
Tensor<GammaDataType> gamma(f_host_tensor_desc_1d(current_dim));
Tensor<BetaDataType> beta(f_host_tensor_desc_1d(current_dim));
Tensor<OutType> out(f_host_tensor_desc_2d(index_length, current_dim));
emb_a.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
emb_b.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
emb_c.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
index_a.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
index_b.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
index_c.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
DeviceMem emb_a_dev(sizeof(EmbType) * emb_a.mDesc.GetElementSpaceSize());
DeviceMem emb_b_dev(sizeof(EmbType) * emb_b.mDesc.GetElementSpaceSize());
DeviceMem emb_c_dev(sizeof(EmbType) * emb_c.mDesc.GetElementSpaceSize());
DeviceMem index_a_dev(sizeof(IndexType) * index_a.mDesc.GetElementSpaceSize());
DeviceMem index_b_dev(sizeof(IndexType) * index_b.mDesc.GetElementSpaceSize());
DeviceMem index_c_dev(sizeof(IndexType) * index_c.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(OutType) * out.mDesc.GetElementSpaceSize());
emb_a_dev.ToDevice(emb_a.mData.data());
emb_b_dev.ToDevice(emb_b.mData.data());
emb_c_dev.ToDevice(emb_c.mData.data());
index_a_dev.ToDevice(index_a.mData.data());
index_b_dev.ToDevice(index_b.mData.data());
index_c_dev.ToDevice(index_c.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
auto device_instance = typename emb_kernel<EmbType, current_dim>::kernel_type{};
auto argument_ptr = device_instance.MakeArgumentPointer(
out_dev.GetDeviceBuffer(),
{ck::type_convert<EmbType*>(emb_a_dev.GetDeviceBuffer()),
ck::type_convert<EmbType*>(emb_b_dev.GetDeviceBuffer()),
ck::type_convert<EmbType*>(emb_c_dev.GetDeviceBuffer())},
{ck::type_convert<IndexType*>(index_a_dev.GetDeviceBuffer()),
ck::type_convert<IndexType*>(index_b_dev.GetDeviceBuffer()),
ck::type_convert<IndexType*>(index_c_dev.GetDeviceBuffer())},
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
current_dim,
index_length,
epsilon,
EmbElementwiseOperation{});
std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString()
<< std::endl
<< std::flush;
bool is_supported = device_instance.IsSupportedArgument(argument_ptr.get());
if(!is_supported)
if(dim_mask & (1 << I.value))
{
std::cout << "Runtime parameters are not supported" << std::endl;
return;
std::srand(std::time(nullptr));
constexpr auto current_dim = dims.At(I);
Tensor<EmbType> emb_a(f_host_tensor_desc_2d(num_rows, current_dim));
Tensor<EmbType> emb_b(f_host_tensor_desc_2d(num_rows, current_dim));
Tensor<EmbType> emb_c(f_host_tensor_desc_2d(num_rows, current_dim));
Tensor<IndexType> index_a(f_host_tensor_desc_1d(index_length));
Tensor<IndexType> index_b(f_host_tensor_desc_1d(index_length));
Tensor<IndexType> index_c(f_host_tensor_desc_1d(index_length));
Tensor<GammaDataType> gamma(f_host_tensor_desc_1d(current_dim));
Tensor<BetaDataType> beta(f_host_tensor_desc_1d(current_dim));
Tensor<OutType> out(f_host_tensor_desc_2d(index_length, current_dim));
emb_a.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
emb_b.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
emb_c.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
index_a.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
index_b.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
index_c.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
DeviceMem emb_a_dev(sizeof(EmbType) * emb_a.mDesc.GetElementSpaceSize());
DeviceMem emb_b_dev(sizeof(EmbType) * emb_b.mDesc.GetElementSpaceSize());
DeviceMem emb_c_dev(sizeof(EmbType) * emb_c.mDesc.GetElementSpaceSize());
DeviceMem index_a_dev(sizeof(IndexType) * index_a.mDesc.GetElementSpaceSize());
DeviceMem index_b_dev(sizeof(IndexType) * index_b.mDesc.GetElementSpaceSize());
DeviceMem index_c_dev(sizeof(IndexType) * index_c.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(OutType) * out.mDesc.GetElementSpaceSize());
emb_a_dev.ToDevice(emb_a.mData.data());
emb_b_dev.ToDevice(emb_b.mData.data());
emb_c_dev.ToDevice(emb_c.mData.data());
index_a_dev.ToDevice(index_a.mData.data());
index_b_dev.ToDevice(index_b.mData.data());
index_c_dev.ToDevice(index_c.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
auto device_instance = typename emb_kernel<EmbType, current_dim>::kernel_type{};
auto argument_ptr = device_instance.MakeArgumentPointer(
out_dev.GetDeviceBuffer(),
{ck::type_convert<EmbType*>(emb_a_dev.GetDeviceBuffer()),
ck::type_convert<EmbType*>(emb_b_dev.GetDeviceBuffer()),
ck::type_convert<EmbType*>(emb_c_dev.GetDeviceBuffer())},
{ck::type_convert<IndexType*>(index_a_dev.GetDeviceBuffer()),
ck::type_convert<IndexType*>(index_b_dev.GetDeviceBuffer()),
ck::type_convert<IndexType*>(index_c_dev.GetDeviceBuffer())},
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
current_dim,
index_length,
epsilon,
EmbElementwiseOperation{});
std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString()
<< std::endl
<< std::flush;
bool is_supported = device_instance.IsSupportedArgument(argument_ptr.get());
if(!is_supported)
{
std::cout << "Runtime parameters are not supported" << std::endl;
return;
}
auto invoker_ptr = device_instance.MakeInvokerPointer();
float time_ms =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
bool pass = true;
{
Tensor<OutType> out_from_dev(f_host_tensor_desc_2d(index_length, current_dim));
ReferenceInstance ref;
auto ref_argument = ref.MakeArgument(out,
emb_a,
emb_b,
emb_c,
index_a,
index_b,
index_c,
gamma,
beta,
num_rows,
current_dim,
index_length,
epsilon);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
out_dev.FromDevice(out_from_dev.mData.data());
pass &=
ck::utils::check_err(out_from_dev, out, "Error: Incorrect results", 1e-3, 1e-3);
}
double total_read = current_dim * index_length * 3 * sizeof(EmbType) +
current_dim * sizeof(GammaDataType) +
current_dim * sizeof(BetaDataType);
double total_write = current_dim * index_length * sizeof(OutType);
double gbps = (total_read + total_write) / time_ms / 1e6;
std::cout << ", total bytes:" << (total_read + total_write) << ", time:" << time_ms
<< ", gbps:" << gbps << ", valid:" << (pass ? "y" : "n") << std::endl
<< std::flush;
}
auto invoker_ptr = device_instance.MakeInvokerPointer();
float time_ms = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
bool pass = true;
{
Tensor<OutType> out_from_dev(f_host_tensor_desc_2d(index_length, current_dim));
ReferenceInstance ref;
auto ref_argument = ref.MakeArgument(out,
emb_a,
emb_b,
emb_c,
index_a,
index_b,
index_c,
gamma,
beta,
num_rows,
current_dim,
index_length,
epsilon);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
out_dev.FromDevice(out_from_dev.mData.data());
pass &= ck::utils::check_err(out_from_dev, out, "Error: Incorrect results", 1e-3, 1e-3);
}
double total_read = current_dim * index_length * 3 * sizeof(EmbType) +
current_dim * sizeof(GammaDataType) +
current_dim * sizeof(BetaDataType);
double total_write = current_dim * index_length * sizeof(OutType);
double gbps = (total_read + total_write) / time_ms / 1e6;
std::cout << ", total bytes:" << (total_read + total_write) << ", time:" << time_ms
<< ", gbps:" << gbps << ", valid:" << (pass ? "y" : "n") << std::endl
<< std::flush;
});
return 0;

View File

@@ -68,6 +68,24 @@ int main(int argc, char* argv[])
}
std::vector<std::size_t> nchw = {16, 128, 32, 64};
if(argc == 1)
{
// use default case
}
else if(argc == 5)
{
nchw[0] = std::stoi(argv[1]);
nchw[1] = std::stoi(argv[2]);
nchw[2] = std::stoi(argv[3]);
nchw[3] = std::stoi(argv[4]);
}
else
{
std::cerr << "arg1 to 4: N, C, H, W" << std::endl;
return 1;
}
std::array<ck::index_t, 4> ab_lengths;
std::array<ck::index_t, 4> ab_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
static_cast<int>(nchw[2] * nchw[3]),

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -98,8 +98,23 @@ int main(int argc, char* argv[])
exit(0);
}
ck::index_t M = 48 * 256;
ck::index_t N = 1024;
ck::index_t M = 48 * 256;
ck::index_t N = 1024;
if(argc == 1)
{
// use default case
}
else if(argc == 3)
{
M = std::stoi(argv[1]);
N = std::stoi(argv[2]);
}
else
{
std::cerr << "arg1 to 2: M, N" << std::endl;
return 1;
}
ck::index_t Stride = N;
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {

View File

@@ -100,7 +100,7 @@ using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizatio
4, // DGammaDstVectorSize
4>; // DBetaDstVectorSize
int main()
int main(int argc, char* argv[])
{
bool time_kernel = false;
@@ -110,6 +110,25 @@ int main()
ck::index_t G = 32;
ck::index_t C = 64;
if(argc == 1)
{
// use default case
}
else if(argc == 6)
{
N = std::stoi(argv[1]);
H = std::stoi(argv[2]);
W = std::stoi(argv[3]);
G = std::stoi(argv[4]);
C = std::stoi(argv[5]);
}
else
{
std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
return 1;
}
Tensor<DYDataType> dy({N, H, W, G, C});
Tensor<XDataType> x({N, H, W, G, C});
Tensor<GammaDataType> gamma({G, C});

View File

@@ -39,7 +39,8 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
std::vector<index_t> window_strides,
std::vector<index_t> window_dilations,
std::vector<index_t> input_left_pads,
std::vector<index_t> input_right_pads)
std::vector<index_t> input_right_pads,
index_t instance_index = -1)
{
constexpr index_t InOutRank = 4;
constexpr index_t WindowRank = 2;
@@ -166,6 +167,11 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
{
++num_kernel;
instance_found = true;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -249,7 +255,11 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(instance_index != -1)
{
std::cout << "avg_pool2d_bwd_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return pass && instance_found;
}
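
To make the skip condition above concrete: num_kernel counts only the instances whose arguments are supported, so instance_index selects among supported instances, zero-based, while num_kernel itself is a 1-based running count. A minimal standalone sketch (the supported flags and printout are invented for illustration; this is not code from the commit):

#include <cstdio>

int main()
{
    const bool supported[5] = {true, false, true, true, false};
    int instance_index = 1; // request the second supported instance
    int num_kernel     = 0; // counts supported instances, 1-based

    for(int i = 0; i < 5; i++)
    {
        if(!supported[i])
        {
            continue; // unsupported instances never advance num_kernel
        }
        ++num_kernel;
        if((instance_index != -1) && (instance_index + 1 != num_kernel))
        {
            continue; // not the requested instance; skip it
        }
        std::printf("profiling op %d (supported instance #%d)\n", i, num_kernel);
    }
    return 0;
}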

View File

@@ -48,7 +48,8 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
std::vector<index_t> window_strides,
std::vector<index_t> window_dilations,
std::vector<index_t> input_left_pads,
std::vector<index_t> input_right_pads)
std::vector<index_t> input_right_pads,
index_t instance_index = -1)
{
constexpr index_t InOutRank = 5;
constexpr index_t WindowRank = 3;
@@ -166,6 +167,11 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -246,7 +252,11 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(instance_index != -1)
{
std::cout << "avg_pool3d_bwd_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return true;
}

View File

@@ -49,10 +49,10 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
int O,
int G0,
int G1,
float alpha = -1.f)
float alpha = -1.f,
int instance_index = -1)
{
using PassThrough = tensor_operation::element_wise::PassThrough;
using ScaleAdd = tensor_operation::element_wise::ScaleAdd;
using AElementOp = PassThrough;
@@ -277,7 +277,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
for(auto& op_ptr : op_ptrs)
{
@@ -314,6 +314,13 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
std::string op_name = op_ptr->GetTypeString();
float ave_time =
@@ -392,6 +399,11 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
if(instance_index != -1)
{
std::cout << "batched_gemm_bias_softmax_gemm_permute_instance (" << instance_index << "/"
<< num_kernel << "): Passed" << std::endl;
}
return pass;
}

View File

@@ -47,7 +47,8 @@ bool profile_batched_gemm_impl(int do_verification,
int BatchStrideA,
int BatchStrideB,
int BatchStrideC,
int BatchCount)
int BatchCount,
int instance_index = -1)
{
bool pass = true;
@@ -138,6 +139,7 @@ bool profile_batched_gemm_impl(int do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
for(auto& op_ptr : op_ptrs)
@@ -203,6 +205,12 @@ bool profile_batched_gemm_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
// re-init C to zero before profiling next kernel
c_device_buf.SetZero();
@@ -259,6 +267,11 @@ bool profile_batched_gemm_impl(int do_verification,
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
if(instance_index != -1)
{
std::cout << "batched_gemm_instance (" << instance_index << "/" << num_kernel << "): Passed"
<< std::endl;
}
return pass;
}

View File

@@ -40,19 +40,19 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
int N,
int K,
int O,
int BatchCount = 1,
int StrideA = -1,
int StrideB0 = -1,
int StrideB1 = -1,
int StrideC = -1,
int BatchStrideA = -1,
int BatchStrideB0 = -1,
int BatchStrideB1 = -1,
int BatchStrideC = -1,
float alpha = -1.f)
int BatchCount = 1,
int StrideA = -1,
int StrideB0 = -1,
int StrideB1 = -1,
int StrideC = -1,
int BatchStrideA = -1,
int BatchStrideB0 = -1,
int BatchStrideB1 = -1,
int BatchStrideC = -1,
float alpha = -1.f,
int instance_index = -1)
{
using Row = tensor_layout::gemm::RowMajor;
using Col = tensor_layout::gemm::ColumnMajor;
using PassThrough = tensor_operation::element_wise::PassThrough;
@@ -253,7 +253,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
for(auto& op_ptr : op_ptrs)
{
@@ -285,6 +285,13 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
std::string op_name = op_ptr->GetTypeString();
float ave_time =
@@ -341,7 +348,11 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
if(instance_index != -1)
{
std::cout << "batched_gemm_softmax_gemm_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return pass;
}

View File

@@ -48,10 +48,10 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
int O,
int G0,
int G1,
float alpha = -1.f)
float alpha = -1.f,
int instance_index = -1)
{
using PassThrough = tensor_operation::element_wise::PassThrough;
using Scale = tensor_operation::element_wise::Scale;
using AElementOp = PassThrough;
@@ -254,6 +254,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
for(auto& op_ptr : op_ptrs)
@@ -287,6 +288,13 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
std::string op_name = op_ptr->GetTypeString();
float ave_time =
@@ -362,7 +370,11 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
if(instance_index != -1)
{
std::cout << "batched_gemm_softmax_gemm_permute_instance (" << instance_index << "/"
<< num_kernel << "): Passed" << std::endl;
}
return pass;
}

View File

@@ -34,7 +34,8 @@ bool profile_batchnorm_backward_impl(bool do_verification,
const std::vector<size_t> inOutLengths,
const std::vector<int> reduceDims,
bool haveSavedMeanInvVar,
double epsilon)
double epsilon,
index_t instance_index = -1)
{
if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
{
@@ -293,6 +294,11 @@ bool profile_batchnorm_backward_impl(bool do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -382,7 +388,11 @@ bool profile_batchnorm_backward_impl(bool do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if (instance_index != -1)
{
std::cout << "batchnorm_backward_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return pass;
}

View File

@@ -35,7 +35,8 @@ bool profile_batchnorm_forward_impl(int do_verification,
bool updateMovingAverage,
bool saveMeanAndInvVariance,
double averageFactor,
double epsilon)
double epsilon,
index_t instance_index = -1)
{
if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
{
@@ -287,6 +288,11 @@ bool profile_batchnorm_forward_impl(int do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -404,7 +410,11 @@ bool profile_batchnorm_forward_impl(int do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(instance_index != -1)
{
std::cout << "batchnorm_forward_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return pass;
}

View File

@@ -32,7 +32,8 @@ bool profile_batchnorm_infer_impl(int do_verification,
bool time_kernel,
const std::vector<size_t> inOutLengths,
const std::vector<int> reduceDims,
double epsilon)
double epsilon,
index_t instance_index = -1)
{
if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
{
@@ -253,6 +254,11 @@ bool profile_batchnorm_infer_impl(int do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -327,7 +333,11 @@ bool profile_batchnorm_infer_impl(int do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if (instance_index != -1)
{
std::cout << "batchnorm_infer_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return pass;
}

View File

@@ -54,7 +54,8 @@ int profile_contraction_impl(ck::index_t do_verification,
const std::vector<ck::index_t>& StridesA, // [M0, M1, K0, K1]
const std::vector<ck::index_t>& StridesB, // [N0, N1, K0, K1]
const std::vector<ck::index_t>& StridesE, // [M0, M1, N0, N1]
const std::vector<ck::index_t>& StridesD) // [M0, M1, N0, N1]
const std::vector<ck::index_t>& StridesD, // [M0, M1, N0, N1]
int instance_index = -1)
{
bool pass = true;
@@ -197,7 +198,7 @@ int profile_contraction_impl(ck::index_t do_verification,
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
for(auto& op_ptr : op_ptrs)
{
@@ -256,6 +257,12 @@ int profile_contraction_impl(ck::index_t do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
// re-init C to zero before profiling next kernel
e_device_buf.SetZero();
@@ -376,6 +383,11 @@ int profile_contraction_impl(ck::index_t do_verification,
<< " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
if(instance_index != -1)
{
std::cout << "contraction_instance (" << instance_index << "/" << num_kernel << "): Passed"
<< std::endl;
}
return pass;
}

View File

@@ -58,7 +58,8 @@ bool profile_conv_bwd_data_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param)
const ck::utils::conv::ConvParam& conv_param,
int instance_index = -1)
{
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -174,7 +175,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device Conv instances
bool pass = true;
@@ -200,6 +201,12 @@ bool profile_conv_bwd_data_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
// for conv bwd data, some input tensor element are zero, but not written by kernel,
// need to set zero
in_device_buf.SetZero();
@@ -263,7 +270,11 @@ bool profile_conv_bwd_data_impl(int do_verification,
std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
<< "\nGB/s: " << best_gb_per_sec << std::endl;
if(instance_index != -1)
{
std::cout << "conv_bwd_data_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return pass;
}

View File

@@ -36,7 +36,8 @@ bool profile_conv_fwd_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param)
const ck::utils::conv::ConvParam& conv_param,
int instance_index = -1)
{
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -156,7 +157,7 @@ bool profile_conv_fwd_impl(int do_verification,
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
bool pass = true;
@@ -182,6 +183,12 @@ bool profile_conv_fwd_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
// re-init output to zero before profiling next kernel
out_device_buf.SetZero();
@@ -236,7 +243,11 @@ bool profile_conv_fwd_impl(int do_verification,
std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
<< "\nGB/s: " << best_gb_per_sec << std::endl;
if(instance_index != -1)
{
std::cout << "conv_fwd_instance (" << instance_index << "/" << num_kernel << "): Passed"
<< std::endl;
}
return pass;
}

View File

@@ -122,7 +122,8 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param)
const ck::utils::conv::ConvParam& conv_param,
index_t instance_index = -1)
{
const ck::index_t NDoHoWo =
conv_param.N_ *
@@ -226,7 +227,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
// profile device op instances
bool pass = true;
bool is_supporting_instance = false;
index_t num_kernel = 0;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
@@ -247,6 +248,12 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
is_supporting_instance = true;
// re-init output to zero before profiling next kernel
out_device_buf.SetZero();
@@ -291,6 +298,11 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
<< "\navg_time: " << best_avg_time << "\nGB/s: " << best_gb_per_sec << std::endl;
if(instance_index != -1)
{
std::cout << "conv_tensor_rearrange_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return is_supporting_instance && pass;
}

View File

@@ -49,7 +49,8 @@ bool profile_elementwise_layernorm_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
std::vector<index_t> length,
index_t instance_index = -1)
{
using Add = ck::tensor_operation::element_wise::Add;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -199,6 +200,11 @@ bool profile_elementwise_layernorm_impl(int do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -270,6 +276,11 @@ bool profile_elementwise_layernorm_impl(int do_verification,
return false;
}
if(instance_index != -1)
{
std::cout << "elementwise_layernorm_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return true;
}

View File

@@ -70,7 +70,8 @@ bool profile_gemm_reduce_impl(int do_verification,
int K,
int StrideA,
int StrideB,
int StrideC)
int StrideC,
int instance_index = -1)
{
bool pass = true;
@@ -249,7 +250,7 @@ bool profile_gemm_reduce_impl(int do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device GEMM instances
for(auto& gemm_ptr : gemm_ptrs)
{
@@ -275,6 +276,12 @@ bool profile_gemm_reduce_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
// init DO, D1 to 0
reduce0_device_buf.SetZero();
reduce1_device_buf.SetZero();
@@ -345,7 +352,11 @@ bool profile_gemm_reduce_impl(int do_verification,
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
if(instance_index != -1)
{
std::cout << "gemm_reduce_instance (" << instance_index << "/" << num_kernel << "): Passed"
<< std::endl;
}
return pass;
}

View File

@@ -44,7 +44,8 @@ bool profile_gemm_splitk_impl(int do_verification,
int StrideC,
int KBatch,
int n_warmup,
int n_iter)
int n_iter,
int instance_index = -1)
{
bool pass = true;
@@ -141,6 +142,7 @@ bool profile_gemm_splitk_impl(int do_verification,
float best_tflops = 0;
float best_gb_per_sec = 0;
float best_kbatch = 0;
int num_kernel = 0;
// profile device GEMM instances
for(auto& op_ptr : op_ptrs)
@@ -175,7 +177,12 @@ bool profile_gemm_splitk_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
// re-init C to zero before profiling next kernel
c_device_buf.SetZero();
@@ -294,7 +301,11 @@ bool profile_gemm_splitk_impl(int do_verification,
<< " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch
<< " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
<< " GB/s, " << best_op_name << std::endl;
if(instance_index != -1)
{
std::cout << "gemm_splitk_instance (" << instance_index << "/" << num_kernel << "): Passed"
<< std::endl;
}
return pass;
}

View File

@@ -35,7 +35,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
ck::index_t split_k = 1)
ck::index_t split_k = 1,
index_t instance_index = -1)
{
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -123,9 +124,9 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
ck::index_t best_split_k = 1;
// profile device op instances
bool pass = true;
auto run_impl = [&](auto& op_ptr, auto& argument_ptr, const index_t& split_k_for_run) {
bool pass = true;
index_t num_kernel = 0;
auto run_impl = [&](auto& op_ptr, auto& argument_ptr, const index_t& split_k_for_run) {
// workspace_sz will be equal to 0 for other layout than NGCHW
const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
DeviceMem workspace_dev(workspace_sz);
@@ -133,6 +134,12 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
return;
}
std::string op_name = op_ptr->GetTypeString();
auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -165,8 +172,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
in_device_buf.FromDevice(in_device.mData.data());
using ComputeType = std::conditional_t<sizeof(OutDataType) < sizeof(WeiDataType),
OutDataType,
WeiDataType>;
OutDataType,
WeiDataType>;
using AccDataType =
std::conditional_t<std::is_same_v<ComputeType, int8_t>, int32_t, float>;
const index_t num_accums = conv_param.K_;
@@ -297,6 +304,11 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
<< "\nGB/s: " << best_gb_per_sec << ", SplitK " << best_split_k << std::endl;
if(instance_index != -1)
{
std::cout << "grouped_conv_bwd_data_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return pass;
}

View File

@@ -41,7 +41,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const std::string& split_k)
const std::string& split_k,
index_t instance_index = -1)
{
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -187,6 +188,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
}
}
index_t num_kernel = 0;
for(auto& op_ptr : op_ptrs)
{
for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
@@ -226,6 +228,12 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
std::string op_name = op_ptr->GetTypeString();
@@ -326,6 +334,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
<< "\nGB/s: " << best_gb_per_sec << ", SplitK " << best_split_k << std::endl;
if(instance_index != -1)
{
std::cout << "grouped_conv_bwd_weight_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return all_pass;
}

View File

@@ -126,7 +126,8 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param)
const ck::utils::conv::ConvParam& conv_param,
int instance_index = -1)
{
const float floor = 0.f;
const float ceil = 2048.f;
@@ -295,6 +296,7 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
bool pass = true;
@@ -307,6 +309,13 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
std::cout << op_ptr->GetTypeString() << " skipped" << std::endl;
return;
}
// re-init output to zero before profiling next kernel
out_device_buf.SetZero();
@@ -420,7 +429,11 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
<< "\nGB/s: " << best_gb_per_sec << std::endl;
if(instance_index != -1)
{
std::cout << "grouped_conv_fwd_bias_bnorm_clamp_instance (" << instance_index << "/"
<< num_kernel << "): Passed" << std::endl;
}
return pass;
}

View File

@@ -64,7 +64,8 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param)
const ck::utils::conv::ConvParam& conv_param,
int instance_index = -1)
{
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -194,7 +195,7 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
bool pass = true;
@@ -206,6 +207,13 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
std::cout << op_ptr->GetTypeString() << " skipped" << std::endl;
return;
}
// re-init output to zero before profiling next kernel
out_device_buf.SetZero();
@@ -317,7 +325,11 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
<< "\nGB/s: " << best_gb_per_sec << std::endl;
if(instance_index != -1)
{
std::cout << "grouped_conv_fwd_bias_clamp_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return pass;
}

View File

@@ -42,7 +42,8 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const OutElementOp out_element_op = OutElementOp{})
const OutElementOp out_element_op = OutElementOp{},
index_t instance_index = -1)
{
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -144,7 +145,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
index_t num_kernel = 0;
// profile device op instances
bool pass = true;
@@ -156,6 +157,13 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
return;
}
std::string op_name = op_ptr->GetTypeString();
auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -253,7 +261,11 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
<< "\nGB/s: " << best_gb_per_sec << std::endl;
if(instance_index != -1)
{
std::cout << "grouped_conv_fwd_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return pass;
}

View File

@@ -44,7 +44,8 @@ bool profile_grouped_gemm_impl(int do_verification,
const std::vector<int>& StrideCs,
const std::vector<int>& kbatches = {},
int n_warmup = 1,
int n_iter = 10)
int n_iter = 10,
int instance_index = -1)
{
bool pass = true;
// TODO: Fixme - we do not pass compute data type here but need it
@@ -195,8 +196,8 @@ bool profile_grouped_gemm_impl(int do_verification,
float best_tflops = 0;
float best_gb_per_sec = 0;
float best_kbatch = 0;
auto p_ds = std::vector<std::array<const void*, 0>>{};
int num_kernel = 0;
auto p_ds = std::vector<std::array<const void*, 0>>{};
if(do_verification)
{
@@ -279,6 +280,13 @@ bool profile_grouped_gemm_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
for(std::size_t i = 0; i < gemm_descs.size(); i++)
c_device_buf[i]->SetZero();
@@ -371,7 +379,11 @@ bool profile_grouped_gemm_impl(int do_verification,
<< best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch
<< std::endl;
}
if(instance_index != -1)
{
std::cout << "grouped_gemm_instance (" << instance_index << "/" << num_kernel << "): Passed"
<< std::endl;
}
return pass;
}

View File

@@ -26,7 +26,8 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
std::vector<index_t> length,
index_t instance_index = -1)
{
// we don't need DGamma and DBeta here, just for reference class
using DGammaDataType = DXDataType;
@@ -162,6 +163,11 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -242,7 +248,11 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(instance_index != -1)
{
std::cout << "groupnorm_bwd_data_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return true;
}

View File

@@ -29,7 +29,8 @@ bool profile_groupnorm_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
std::vector<index_t> length,
index_t instance_index = -1)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -178,6 +179,11 @@ bool profile_groupnorm_impl(int do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -267,6 +273,12 @@ bool profile_groupnorm_impl(int do_verification,
return false;
}
if(instance_index != -1)
{
std::cout << "groupnorm_instance (" << instance_index << "/" << num_kernel << "): Passed"
<< std::endl;
}
return true;
}

View File

@@ -27,7 +27,8 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
std::vector<index_t> length,
index_t instance_index = -1)
{
// we don't need DGamma and DBeta here, just for reference class
using DGammaDataType = DXDataType;
@@ -167,6 +168,11 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -247,7 +253,11 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(instance_index != -1)
{
std::cout << "layernorm_bwd_data_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return true;
}

View File

@@ -27,7 +27,8 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
std::vector<index_t> length,
index_t instance_index = -1)
{
// we don't need GammaDataType and DXDataType here, just for reference class
using GammaDataType = DYDataType;
@@ -178,6 +179,11 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -255,7 +261,11 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(instance_index != -1)
{
std::cout << "layernorm_bwd_gamma_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return true;
}

View File

@@ -28,7 +28,8 @@ bool profile_layernorm_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
std::vector<index_t> length,
index_t instance_index = -1)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -188,6 +189,11 @@ bool profile_layernorm_impl(int do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -286,6 +292,12 @@ bool profile_layernorm_impl(int do_verification,
return false;
}
if(instance_index != -1)
{
std::cout << "layernorm_instance (" << instance_index << "/" << num_kernel << "): Passed"
<< std::endl;
}
return true;
}

View File

@@ -34,7 +34,8 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
std::vector<index_t> window_strides,
std::vector<index_t> window_dilations,
std::vector<index_t> input_left_pads,
std::vector<index_t> input_right_pads)
std::vector<index_t> input_right_pads,
index_t instance_index = -1)
{
// AtomicAdd only support f32 for now. ComputeDataType must be float32
using ComputeDataType = float;
@@ -199,6 +200,11 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
{
++num_kernel;
instance_found = true;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -289,7 +295,11 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(instance_index != -1)
{
std::cout << "max_pool2d_bwd_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return pass && instance_found;
}

View File

@@ -34,7 +34,8 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
std::vector<index_t> window_strides,
std::vector<index_t> window_dilations,
std::vector<index_t> input_left_pads,
std::vector<index_t> input_right_pads)
std::vector<index_t> input_right_pads,
index_t instance_index = -1)
{
// AtomicAdd only support f32 for now. ComputeDataType must be float32
using ComputeDataType = float;
@@ -193,6 +194,11 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -281,7 +287,11 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(instance_index != -1)
{
std::cout << "max_pool3d_bwd_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return true;
}

View File

@@ -35,7 +35,8 @@ bool profile_pool2d_fwd_impl(int do_verification,
std::vector<index_t> window_strides,
std::vector<index_t> window_dilations,
std::vector<index_t> input_left_pads,
std::vector<index_t> input_right_pads)
std::vector<index_t> input_right_pads,
index_t instance_index = -1)
{
constexpr index_t InOutRank = 4;
constexpr index_t WindowRank = 2;
@@ -171,6 +172,11 @@ bool profile_pool2d_fwd_impl(int do_verification,
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -268,7 +274,11 @@ bool profile_pool2d_fwd_impl(int do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(instance_index != -1)
{
std::cout << "max_pool2d_fwd_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return true;
}

View File

@@ -46,7 +46,9 @@ template <typename InDataType,
ck::ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool OutputIndex>
bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& kernel_params)
bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params,
PoolFwdKernelParams& kernel_params,
index_t instance_index = -1)
{
constexpr index_t InOutRank = 5;
constexpr index_t WindowRank = 3;
@@ -199,6 +201,11 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
else
{
@@ -328,7 +335,11 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(instance_index != -1)
{
std::cout << "max_pool3d_fwd_instance (" << instance_index << "/" << num_kernel
<< "): Passed" << std::endl;
}
return true;
}

View File

@@ -144,7 +144,8 @@ bool profile_reduce_impl_impl(bool do_verification,
const std::vector<size_t>& inLengths,
const std::array<int, NumReduceDim>& reduceDims,
float alpha,
float beta)
float beta,
index_t instance_index = -1)
{
using namespace ck::tensor_operation::device;
using namespace ck::tensor_operation::device::instance;
@@ -373,7 +374,14 @@ bool profile_reduce_impl_impl(bool do_verification,
if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
continue;
else
{
num_kernel++;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
std::string reduce_name = reduce_ptr->GetTypeString();
@@ -452,7 +460,11 @@ bool profile_reduce_impl_impl(bool do_verification,
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
};
if(instance_index != -1)
{
std::cout << "reduce_instance (" << instance_index << "/" << num_kernel << "): Passed"
<< std::endl;
}
return pass;
};
@@ -467,7 +479,8 @@ bool profile_reduce_impl(bool do_verification,
bool PropagateNan,
bool UseIndex,
float alpha,
float beta)
float beta,
index_t instance_index = -1)
{
bool matched = false;
bool pass = true;
@@ -505,7 +518,8 @@ bool profile_reduce_impl(bool do_verification,
inLengths,
arrReduceDims,
alpha,
beta);
beta,
instance_index);
matched = true;
});

View File

@@ -53,7 +53,8 @@ bool profile_softmax_impl(int do_verification,
std::vector<index_t> in_strides,
std::vector<index_t> reduce_dims,
double alpha,
double beta)
double beta,
index_t instance_index = -1)
{
if(Rank != in_length.size())
{
@@ -124,7 +125,7 @@ bool profile_softmax_impl(int do_verification,
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
std::vector<bool> instance_pass;
index_t num_kernel = 0;
for(auto& inst_ptr : instances)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
@@ -146,6 +147,15 @@ bool profile_softmax_impl(int do_verification,
instance_pass.push_back(true);
continue;
}
else
{
num_kernel++;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// skip test if instance_index is specified
continue;
}
}
out_dev.ToDevice(prior_out.data());
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
@@ -216,6 +226,11 @@ bool profile_softmax_impl(int do_verification,
std::cout << "alpha = " << alpha << ", " << "beta = " << beta << ", " << best_avg_time
<< " ms, " << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
}
if(instance_index != -1)
{
std::cout << "reduce_instance (" << instance_index << "/" << num_kernel << "): Passed"
<< std::endl;
}
return std::all_of(
std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; });
}

View File

@@ -12,7 +12,8 @@
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
struct GemmParams
{
ck::index_t M;
@@ -37,96 +38,153 @@ class TestBatchedGemm : public ::testing::Test
using namespace ck::tensor_operation::device;
bool pass = true;
for(auto& param : params)
for(size_t i = 0; i < params.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = params[i];
const auto M = param.M;
const auto N = param.N;
const auto K = param.K;
const auto BatchCount = param.BatchCount;
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true,
1,
false,
1,
M,
N,
K,
K,
N,
N,
M * K,
K * N,
M * N,
BatchCount,
instance_index);
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true,
1,
false,
1,
M,
N,
K,
K,
K,
N,
M * K,
K * N,
M * N,
BatchCount,
instance_index);
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true,
1,
false,
1,
M,
N,
K,
M,
N,
N,
M * K,
K * N,
M * N,
BatchCount,
instance_index);
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true,
1,
false,
1,
M,
N,
K,
M,
K,
N,
M * K,
K * N,
M * N,
BatchCount,
instance_index);
}
EXPECT_TRUE(pass);
}
@@ -191,3 +249,20 @@ TEST_F(TestBatchedGemm, fp16)
// this->template Run<float>();
// }
// #endif
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}
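
As a usage note for this main(): running the test binary with no arguments keeps the default full sweep, while passing two arguments (param_mask, then instance_index) restricts it; for example, 0x1 0 would exercise only the first parameter set against only the first supported instance, and -1 for instance_index keeps all instances. The concrete invocation is an illustration; the binary name depends on the build.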

View File

@@ -13,6 +13,9 @@
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
struct GemmParams
{
ck::index_t M;
@@ -37,96 +40,153 @@ class TestBatchedGemm : public ::testing::Test
using namespace ck::tensor_operation::device;
bool pass = true;
for(auto& param : params)
for(size_t i = 0; i < params.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = params[i];
const auto M = param.M;
const auto N = param.N;
const auto K = param.K;
const auto BatchCount = param.BatchCount;
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true,
1,
false,
1,
M,
N,
K,
K,
N,
N,
M * K,
K * N,
M * N,
BatchCount,
instance_index);
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true,
1,
false,
1,
M,
N,
K,
K,
K,
N,
M * K,
K * N,
M * N,
BatchCount,
instance_index);
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true,
1,
false,
1,
M,
N,
K,
M,
N,
N,
M * K,
K * N,
M * N,
BatchCount,
instance_index);
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true,
1,
false,
1,
M,
N,
K,
M,
K,
N,
M * K,
K * N,
M * N,
BatchCount,
instance_index);
}
EXPECT_TRUE(pass);
}
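
The gating pattern added to these loops treats param_mask as a bitset over parameter sets: bit i must be set for params[i] to run. A self-contained sketch of just that predicate (mask value made up for illustration):

#include <iostream>

int main()
{
    const long param_mask = 0x5; // binary 101: keep parameter sets 0 and 2
    for(int i = 0; i < 4; i++)
    {
        if((param_mask & (1 << i)) == 0)
        {
            continue; // masked out, exactly as in the test loops above
        }
        std::cout << "running parameter set " << i << std::endl; // 0 and 2
    }
    return 0;
}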
@@ -183,3 +243,20 @@ TEST_F(TestBatchedGemm, fp32)
this->template Run<float>();
}
#endif
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -7,6 +7,8 @@
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
static ck::index_t instance_index = -1;
namespace {
using F16 = ck::half_t;
@@ -70,7 +72,8 @@ class TestBatchedGemmMultiD : public ::testing::Test
M * K,
K * N,
M * N,
BatchCount);
BatchCount,
instance_index);
EXPECT_TRUE(pass);
}
};
@@ -88,3 +91,18 @@ TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run<F16>(); }
#ifdef CK_ENABLE_INT8
TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run<int8_t>(); }
#endif
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 2)
{
instance_index = atoi(argv[1]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1: instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -4,6 +4,9 @@
#include "gtest/gtest.h"
#include "test_batched_gemm_softmax_gemm_util.hpp"
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
template <typename Tuple>
class TestBatchedGemmSoftmaxGemmFP16 : public TestBatchedGemmSoftmaxGemm<Tuple>
{
@@ -174,3 +177,20 @@ TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, AdhocTest)
};
this->Run();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -9,6 +9,9 @@
#include "profiler/profile_batched_gemm_softmax_gemm_impl.hpp"
using ck::tensor_operation::device::GemmSpecialization;
extern ck::index_t param_mask;
extern ck::index_t instance_index;
template <ck::index_t N>
using I = ck::Number<N>;
@@ -57,15 +60,38 @@ struct TestBatchedGemmSoftmaxGemm : public ::testing::Test
B1Layout,
CLayout,
MaskingType::value>(
verify_, 1, false, bench_, M, N, K, O, BatchCount);
verify_,
1,
false,
bench_,
M,
N,
K,
O,
BatchCount,
-1, // StrideA
-1, // StrideB0
-1, // StrideB1
-1, // StrideC
-1, // BatchStrideA
-1, // BatchStrideB0
-1, // BatchStrideB1
-1, // BatchStrideC
-1, // alpha
instance_index);
EXPECT_TRUE(pass);
}
void Run()
{
for(auto lengths : this->lengths_)
for(size_t i = 0; i < this->lengths_.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& lengths = this->lengths_[i];
int M = lengths[0];
int N = lengths[1];
int K = lengths[2];

View File

@@ -4,6 +4,8 @@
#include "gtest/gtest.h"
#include "test_batched_gemm_bias_softmax_gemm_permute_util.hpp"
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
template <typename Tuple>
class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16
: public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -180,3 +182,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest)
};
this->Run();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -4,6 +4,8 @@
#include "gtest/gtest.h"
#include "test_batched_gemm_softmax_gemm_permute_util.hpp"
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
template <typename Tuple>
class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16
: public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -180,3 +182,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, AdhocTest)
};
this->Run();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -10,7 +10,8 @@
#include "profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp"
#include <hip/hip_runtime.h>
extern ck::index_t param_mask;
extern ck::index_t instance_index;
using ck::tensor_operation::device::GemmSpecialization;
using ck::tensor_operation::device::MaskingSpecialization;
using ck::tensor_operation::device::TensorSpecialization;
@@ -66,21 +67,26 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test
Acc0BiasDataType,
Acc1BiasDataType,
MaskingType::value>(
verify_, 2, false, bench_, M, N, K, O, G0, G1);
verify_, 2, false, bench_, M, N, K, O, G0, G1, -1, instance_index);
EXPECT_TRUE(pass);
}
void Run()
{
for(auto lengths : this->lengths_)
for(size_t i = 0; i < this->lengths_.size(); i++)
{
int M = lengths[0];
int N = lengths[1];
int K = lengths[2];
int O = lengths[3];
int G0 = lengths[4];
int G1 = lengths[5];
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& lengths = this->lengths_[i];
int M = lengths[0];
int N = lengths[1];
int K = lengths[2];
int O = lengths[3];
int G0 = lengths[4];
int G1 = lengths[5];
this->RunSingle(M, N, K, O, G0, G1);
}

View File

@@ -5,6 +5,8 @@
#include "test_batched_gemm_softmax_gemm_permute_util.hpp"
#include "test_batched_gemm_device_utils.hpp"
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
template <typename Tuple>
class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16
: public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -228,3 +230,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest)
};
this->Run();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -5,6 +5,9 @@
#include "test_batched_gemm_softmax_gemm_permute_util.hpp"
#include "test_batched_gemm_device_utils.hpp"
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
template <typename Tuple>
class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16
: public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -191,3 +194,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, AdhocTest)
};
this->Run();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -9,6 +9,8 @@
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
#include "profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp"
extern ck::index_t param_mask;
extern ck::index_t instance_index;
using ck::tensor_operation::device::GemmSpecialization;
using ck::tensor_operation::device::MaskingSpecialization;
using ck::tensor_operation::device::TensorSpecialization;
@@ -64,21 +66,26 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test
ck::Tuple<>,
ck::Tuple<>,
MaskingType::value>(
verify_, 2, false, bench_, M, N, K, O, G0, G1);
verify_, 2, false, bench_, M, N, K, O, G0, G1, -1, instance_index);
EXPECT_TRUE(pass);
}
void Run()
{
for(auto lengths : this->lengths_)
for(size_t i = 0; i < this->lengths_.size(); i++)
{
int M = lengths[0];
int N = lengths[1];
int K = lengths[2];
int O = lengths[3];
int G0 = lengths[4];
int G1 = lengths[5];
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& lengths = this->lengths_[i];
int M = lengths[0];
int N = lengths[1];
int K = lengths[2];
int O = lengths[3];
int G0 = lengths[4];
int G1 = lengths[5];
this->RunSingle(M, N, K, O, G0, G1);
}

View File

@@ -15,6 +15,9 @@ using F32 = float;
using BF16 = ck::bhalf_t;
using F64 = double;
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestBatchNormBwdRank4 : public ::testing::Test
{
@@ -37,33 +40,48 @@ class TestBatchNormBwdRank4 : public ::testing::Test
template <int NumReduceDim>
void Run()
{
for(auto& inOutLengths : list_of_lengths)
for(size_t i = 0; i < list_of_lengths.size(); i++)
{
bool pass = true;
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& inOutLengths = list_of_lengths[i];
bool pass = true;
EXPECT_FALSE(reduceDims.size() != NumReduceDim);
pass = pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
DxDataType,
DyDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, true, epsilon);
pass =
pass &&
ck::profiler::profile_batchnorm_backward_impl<XDataType,
DxDataType,
DyDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, true, epsilon, instance_index);
pass = pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
DxDataType,
DyDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, false, epsilon);
pass =
pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
DxDataType,
DyDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(true,
3,
false,
false,
inOutLengths,
reduceDims,
false,
epsilon,
instance_index);
EXPECT_TRUE(pass);
}
@@ -103,3 +121,19 @@ TYPED_TEST(TestBatchNormBwdRank4, nchw)
this->reduceDims = {0, 2, 3};
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -16,6 +16,9 @@ using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F64 = double;
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestBatchNormFwdRank4 : public ::testing::Test
{
@@ -38,9 +41,14 @@ class TestBatchNormFwdRank4 : public ::testing::Test
template <int NumReduceDim>
void Run()
{
for(auto& inOutLengths : list_of_lengths)
for(size_t i = 0; i < list_of_lengths.size(); i++)
{
bool pass = true;
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& inOutLengths = list_of_lengths[i];
bool pass = true;
EXPECT_FALSE(reduceDims.size() != NumReduceDim);
@@ -61,7 +69,8 @@ class TestBatchNormFwdRank4 : public ::testing::Test
true,
true,
epsilon,
averageFactor);
averageFactor,
instance_index);
pass =
pass && ck::profiler::profile_batchnorm_forward_impl<XDataType,
@@ -80,7 +89,8 @@ class TestBatchNormFwdRank4 : public ::testing::Test
false,
false,
epsilon,
averageFactor);
averageFactor,
instance_index);
EXPECT_TRUE(pass);
}
@@ -120,3 +130,19 @@ TYPED_TEST(TestBatchNormFwdRank4, nchw)
this->reduceDims = {0, 2, 3};
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -10,6 +10,9 @@
#include "profiler/profile_batchnorm_infer_impl.hpp"
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
using F16 = ck::half_t;
using F32 = float;
using BF16 = ck::bhalf_t;
@@ -36,31 +39,38 @@ class TestBatchNormInferRank4 : public ::testing::Test
template <int NumReduceDim>
void Run()
{
for(auto& inOutLengths : list_of_lengths)
for(size_t i = 0; i < list_of_lengths.size(); i++)
{
bool pass = true;
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& inOutLengths = list_of_lengths[i];
bool pass = true;
EXPECT_FALSE(reduceDims.size() != NumReduceDim);
pass = pass && ck::profiler::profile_batchnorm_infer_impl<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, epsilon);
pass = pass &&
ck::profiler::profile_batchnorm_infer_impl<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, epsilon, instance_index);
pass = pass && ck::profiler::profile_batchnorm_infer_impl<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, epsilon);
pass = pass &&
ck::profiler::profile_batchnorm_infer_impl<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, epsilon, instance_index);
EXPECT_TRUE(pass);
}
@@ -100,3 +110,20 @@ TYPED_TEST(TestBatchNormInferRank4, nchw)
this->reduceDims = {0, 2, 3};
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -12,10 +12,11 @@
#include "profiler/profile_contraction_impl.hpp"
#include "profiler/profile_contraction_utils.hpp"
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using F64 = double;
static ck::index_t instance_index = -1;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using F64 = double;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -95,7 +96,8 @@ class TestContraction : public ::testing::Test
StridesA,
StridesB,
StridesC,
StridesD);
StridesD,
instance_index);
EXPECT_TRUE(pass);
}
}
@@ -219,3 +221,18 @@ TYPED_TEST(TestContractionScaleMixedPrecision, scale)
this->template Run<2>({{8, 16}, {1, 1}, {8, 16}});
this->template Run<2>({{1, 1}, {1, 1}, {1, 1}});
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 2)
{
instance_index = atoi(argv[1]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1: instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -11,6 +11,9 @@
#include "profiler/profile_conv_tensor_rearrange_impl.hpp"
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestConvTensorRearrange : public ::testing::Test
{
@@ -25,18 +28,24 @@ class TestConvTensorRearrange : public ::testing::Test
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
for(size_t i = 0; i < conv_params.size(); i++)
{
pass = pass && ck::profiler::profile_conv_tensor_rearrange_impl<NDimSpatial,
ImLayout,
InDataType,
OutDataType,
ConvTensorRearrangeOp>(
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
pass = pass && ck::profiler::profile_conv_tensor_rearrange_impl<NDimSpatial,
ImLayout,
InDataType,
OutDataType,
ConvTensorRearrangeOp>(
true, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
param,
instance_index);
}
EXPECT_TRUE(pass);
}
@@ -157,3 +166,19 @@ TYPED_TEST(TestConvTensorRearrange3d, Test3D)
this->template Run<3, int8_t, int8_t>();
#endif
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -9,7 +9,8 @@
#include <gtest/gtest.h>
#include "profiler/profile_conv_bwd_data_impl.hpp"
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestConvndBwdData : public ::testing::Test
{
@@ -20,10 +21,15 @@ class TestConvndBwdData : public ::testing::Test
template <ck::index_t NDimSpatial>
void Run()
{
for(auto& param : conv_params)
EXPECT_FALSE(conv_params.empty());
for(size_t i = 0; i < conv_params.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
bool pass;
EXPECT_FALSE(conv_params.empty());
pass = ck::profiler::profile_conv_bwd_data_impl<
NDimSpatial,
ck::tuple_element_t<NDimSpatial - 1,
@@ -44,7 +50,8 @@ class TestConvndBwdData : public ::testing::Test
1, // init_method integer value
false, // do_log
false, // time_kernel
param);
param,
instance_index);
EXPECT_TRUE(pass);
}
}
@@ -91,3 +98,19 @@ TYPED_TEST(TestConvndBwdData, Conv3dBwdData)
{3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -10,6 +10,8 @@
#include "profiler/profile_conv_fwd_impl.hpp"
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestConvndFwd : public ::testing::Test
{
@@ -20,10 +22,15 @@ class TestConvndFwd : public ::testing::Test
template <ck::index_t NDimSpatial>
void Run()
{
for(auto& param : conv_params)
EXPECT_FALSE(conv_params.empty());
for(size_t i = 0; i < conv_params.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
bool pass;
EXPECT_FALSE(conv_params.empty());
pass = ck::profiler::profile_conv_fwd_impl<
NDimSpatial,
ck::tuple_element_t<NDimSpatial - 1,
@@ -44,7 +51,8 @@ class TestConvndFwd : public ::testing::Test
1, // init_method integer value
false, // do_log
false, // time_kernel
param);
param,
instance_index);
EXPECT_TRUE(pass);
}
}
@@ -90,3 +98,19 @@ TYPED_TEST(TestConvndFwd, Conv3dFwd)
{3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -8,6 +8,9 @@ using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestElementwiseLayernorm : public ::testing::Test
{
@@ -25,15 +28,20 @@ class TestElementwiseLayernorm : public ::testing::Test
std::vector<std::vector<ck::index_t>> lengths = {
{1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}, {4096, 8192}};
for(auto length : lengths)
for(size_t i = 0; i < lengths.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& length = lengths[i];
bool success = ck::profiler::profile_elementwise_layernorm_impl<ADataType,
BDataType,
GammaDataType,
BetaDataType,
AccDataType,
YDataType>(
true, 2, false, false, length);
true, 2, false, false, length, instance_index);
EXPECT_TRUE(success);
}
}
@@ -45,3 +53,19 @@ using KernelTypes = ::testing::Types<
TYPED_TEST_SUITE(TestElementwiseLayernorm, KernelTypes);
TYPED_TEST(TestElementwiseLayernorm, Test_FP16) { this->Run(); }
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -31,4 +31,4 @@ using AccDataType = float;
#include "run_gemm_test.inc"
int main() { return run_gemm_test(); }
int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }

View File

@@ -31,4 +31,4 @@ using AccDataType = float;
#include "run_gemm_test.inc"
int main() { return run_gemm_test(); }
int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }

View File

@@ -31,4 +31,4 @@ using AccDataType = float;
#include "run_gemm_test.inc"
int main() { return run_gemm_test(); }
int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }

View File

@@ -31,4 +31,4 @@ using AccDataType = double;
#include "run_gemm_test.inc"
int main() { return run_gemm_test(); }
int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }

View File

@@ -31,4 +31,4 @@ using AccDataType = int32_t;
#include "run_gemm_test.inc"
int main() { return run_gemm_test(); }
int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }

View File

@@ -105,6 +105,7 @@ int main(int argc, char* argv[])
bool do_verification = true;
bool time_kernel = true;
int problem_index = -1;
if(argc == 1)
{
@@ -115,16 +116,28 @@ int main(int argc, char* argv[])
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
problem_index = std::stoi(argv[3]);
}
else
{
std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
<< "arg2: time kernel (0=no, 1=yes)" << std::endl;
<< "arg2: time kernel (0=no, 1=yes)" << std::endl
<< "arg3: problem index (0-35, -1 means all)" << std::endl;
return 0;
}
bool pass = true;
for(auto& p : problems)
for(size_t i = 0; i < problems.size(); i++)
{
if(problem_index != -1 && problem_index != static_cast<ck::index_t>(i))
{
continue;
}
auto& p = problems[i];
GemmParams& problem_size = std::get<0>(p);
const LayoutConfig& layout_config = std::get<1>(p);
const auto& factory = std::get<2>(p);
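
Unlike the mask-based tests, this example selects by direct index comparison: problem_index either is -1 (run everything) or must equal the loop index. A standalone sketch of the same control flow (data is illustrative):

#include <iostream>
#include <vector>

int main()
{
    std::vector<const char*> problems = {"p0", "p1", "p2"};
    const int problem_index = 1; // -1 would run everything
    for(size_t i = 0; i < problems.size(); i++)
    {
        if(problem_index != -1 && problem_index != static_cast<int>(i))
        {
            continue;
        }
        std::cout << problems[i] << std::endl; // prints only "p1"
    }
    return 0;
}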

View File

@@ -261,6 +261,44 @@ struct TestGemm
return true;
}
}
template <template <class...> class DeviceGemmPtr_,
typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename CDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
bool IsSupportedArgument(DeviceGemmPtr_<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation>* gemmPtr,
const GemmParams& params = GemmParams{})
{
auto invoker_ptr = gemmPtr->MakeInvokerPointer();
auto argument_ptr = gemmPtr->MakeArgumentPointer(static_cast<ADataType*>(nullptr),
static_cast<BDataType*>(nullptr),
static_cast<CDataType*>(nullptr),
params.M,
params.N,
params.K,
params.StrideA,
params.StrideB,
params.StrideC,
AElementwiseOperation{},
BElementwiseOperation{},
CElementwiseOperation{});
return gemmPtr->IsSupportedArgument(argument_ptr.get());
}
};
} // namespace gemm_util
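
The new IsSupportedArgument helper builds the argument with null data pointers: only shapes, strides, and element ops are inspected at this stage, so a support query needs no allocation. A generic, hypothetical analogue of that dry-run idea (FakeInstance is not a CK type):

#include <iostream>

struct FakeInstance // stand-in for a CK device op; illustration only
{
    // only metadata is inspected, so the data pointer may legally be null
    bool IsSupportedArgument(const float* /*a*/, int M, int N) const
    {
        return (M % 8 == 0) && (N % 8 == 0);
    }
};

int main()
{
    FakeInstance inst;
    std::cout << inst.IsSupportedArgument(nullptr, 64, 64) << std::endl; // 1
    std::cout << inst.IsSupportedArgument(nullptr, 63, 64) << std::endl; // 0
    return 0;
}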

View File

@@ -1,13 +1,39 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
int run_gemm_test()
int run_gemm_test(int argc, char* argv[])
{
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
ck::gemm_util::GemmParams params;
ck::index_t instance_index = -1;
if(argc == 1)
{
// use default params
}
else if(argc == 4 || argc == 5)
{
params.M = atoi(argv[1]);
params.N = atoi(argv[2]);
params.K = atoi(argv[3]);
params.StrideA = params.M;
params.StrideB = params.N;
params.StrideC = params.K;
if(argc == 5)
{
instance_index = atoi(argv[4]);
}
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1-4: M N K instance_index(-1 means all)" << std::endl;
}
std::cout << "Params (M, N, K, index) " << params.M << " " << params.N << " " << params.K << " "
<< instance_index << std::endl;
auto test = [&](auto a_layout, auto b_layout, auto c_layout) {
bool pass = true;
@@ -24,10 +50,31 @@ int run_gemm_test()
const auto gemmPtrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
ck::index_t num_instance = 0;
for(auto& gemmPtr : gemmPtrs)
{
pass &= ck::gemm_util::TestGemm<AccDataType>{}(gemmPtr.get());
if(instance_index == -1)
{
pass &= ck::gemm_util::TestGemm<AccDataType>{}(gemmPtr.get(), params);
}
else
{
auto test_gemm = ck::gemm_util::TestGemm<AccDataType>{};
if(test_gemm.IsSupportedArgument(gemmPtr.get(), params))
{
if(num_instance == instance_index)
{
pass &= test_gemm(gemmPtr.get(), params);
}
num_instance++;
}
}
}
if(instance_index != -1)
{
std::cout << "TestGemm_instance (" << instance_index << "/" << num_instance
<< "): " << (pass ? "Passed" : "Failed") << std::endl;
}
return pass;
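
Note that num_instance above counts only instances that pass the support check, so instance_index is a zero-based ordinal over supported instances, not over the whole instance library. A self-contained analogue (hypothetical instance list):

#include <functional>
#include <iostream>
#include <vector>

int main()
{
    std::vector<std::function<bool()>> is_supported = {
        [] { return true; }, [] { return false; }, [] { return true; }};
    const int instance_index = 1; // second *supported* instance
    int num_instance = 0;
    for(auto& supported : is_supported)
    {
        if(!supported())
        {
            continue; // unsupported instances do not consume an ordinal
        }
        if(num_instance == instance_index)
        {
            std::cout << "running supported instance #" << num_instance
                      << std::endl; // fires for the third entry overall
        }
        num_instance++;
    }
    return 0;
}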

View File

@@ -4,9 +4,20 @@
#include <iostream>
#include "profiler/profile_gemm_reduce_impl.hpp"
int main()
static ck::index_t instance_index = -1;
int main(int argc, char** argv)
{
if(argc == 1) {}
else if(argc == 2)
{
instance_index = atoi(argv[1]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1: instance_index(-1 means all)" << std::endl;
}
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -19,22 +30,22 @@ int main()
pass = pass &&
ck::profiler::
profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Row, Row>(
true, 1, false, false, M, N, K, K, N, N);
true, 1, false, false, M, N, K, K, N, N, instance_index);
pass = pass &&
ck::profiler::
profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Col, Row>(
true, 1, false, false, M, N, K, K, K, N);
true, 1, false, false, M, N, K, K, K, N, instance_index);
pass = pass &&
ck::profiler::
profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Row, Row>(
true, 1, false, false, M, N, K, M, N, N);
true, 1, false, false, M, N, K, M, N, N, instance_index);
pass = pass &&
ck::profiler::
profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Col, Row>(
true, 1, false, false, M, N, K, M, K, N);
true, 1, false, false, M, N, K, M, K, N, instance_index);
if(pass)
{

View File

@@ -15,6 +15,8 @@
#include "include/ck/utility/data_type.hpp"
#include "profiler/profile_gemm_splitk_impl.hpp"
extern ck::index_t param_mask;
extern ck::index_t instance_index;
namespace ck {
namespace test {
@@ -48,8 +50,13 @@ class TestGemmSplitK : public testing::Test
const int StrideB,
const int StrideC)
{
for(auto kb : k_batches_)
for(size_t i = 0; i < k_batches_.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto kb = k_batches_[i];
RunSingle(M, N, K, StrideA, StrideB, StrideC, kb);
}
}
@@ -82,7 +89,8 @@ class TestGemmSplitK : public testing::Test
StrideC,
kbatch,
n_warmup,
n_iter);
n_iter,
instance_index);
EXPECT_TRUE(pass);
}
};

View File

@@ -7,6 +7,9 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_splitk_util.hpp"
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
using F16 = ck::half_t;
using F32 = float;
@@ -64,3 +67,20 @@ TYPED_TEST_SUITE(TestGemmSplitK_KM_KN, KernelTypes);
TYPED_TEST_SUITE(TestGemmSplitK_KM_NK, KernelTypes);
#include "test_gemm_splitk_ut_cases.inc"
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -14,7 +14,8 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/data_type.hpp"
#include "profiler/profile_gemm_universal_impl.hpp"
extern ck::index_t param_mask;
extern ck::index_t instance_index;
namespace ck {
namespace test {
@@ -49,8 +50,13 @@ class TestGemmUniversal : public testing::Test
const int StrideB,
const int StrideC)
{
for(auto kb : k_batches_)
for(size_t i = 0; i < k_batches_.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto kb = k_batches_[i];
RunSingle(M, N, K, StrideA, StrideB, StrideC, kb);
}
}
@@ -84,7 +90,8 @@ class TestGemmUniversal : public testing::Test
StrideC,
kbatch,
n_warmup,
n_iter);
n_iter,
instance_index);
EXPECT_TRUE(pass);
}
};

View File

@@ -6,10 +6,11 @@
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_universal_util.hpp"
using I4 = ck::pk_i4_t;
using BF16 = ck::bhalf_t;
using F32 = float;
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
using I4 = ck::pk_i4_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -85,3 +86,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_KN, KernelTypes_KM_KN);
TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_NK, KernelTypes_KM_NK);
#include "test_gemm_universal_ut_cases_bf16.inc"
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -6,10 +6,11 @@
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_universal_util.hpp"
using I4 = ck::pk_i4_t;
using F8 = ck::f8_t;
using F16 = ck::half_t;
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
using I4 = ck::pk_i4_t;
using F8 = ck::f8_t;
using F16 = ck::half_t;
using F32 = float;
@@ -99,3 +100,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_FP16_KM_NK, KernelTypes_KM_NK);
TYPED_TEST_SUITE(TestGemmUniversal_FP16_KM_KN, KernelTypes_KM_KN);
#include "test_gemm_universal_ut_cases_fp16.inc"
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -6,7 +6,8 @@
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_universal_util.hpp"
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
#if defined(CK_USE_WMMA_FP8)
using F8 = ck::f8_t;
@@ -59,3 +60,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK);
#include "test_gemm_universal_ut_cases_fp8.inc"
#endif // CK_USE_WMMA_FP8
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -6,9 +6,10 @@
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_universal_util.hpp"
using BF16 = ck::bhalf_t;
using F32 = float;
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -80,3 +81,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_KN, KernelTypes_KM_KN);
TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_NK, KernelTypes_KM_NK);
#include "test_gemm_universal_ut_cases_bf16.inc"
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -6,9 +6,10 @@
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_universal_util.hpp"
using F8 = ck::f8_t;
using F16 = ck::half_t;
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
using F8 = ck::f8_t;
using F16 = ck::half_t;
using F32 = float;
@@ -92,3 +93,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_FP16_KM_NK, KernelTypes_KM_NK);
TYPED_TEST_SUITE(TestGemmUniversal_FP16_KM_KN, KernelTypes_KM_KN);
#include "test_gemm_universal_ut_cases_fp16.inc"
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -6,11 +6,12 @@
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_universal_util.hpp"
using F8 = ck::f8_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
using F8 = ck::f8_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -69,3 +70,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK);
#include "test_gemm_universal_ut_cases_fp8.inc"
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -15,6 +15,9 @@
#include "include/ck/utility/data_type.hpp"
#include "profiler/profile_gemm_universal_streamk_impl.hpp"
extern ck::index_t param_mask;
extern ck::index_t instance_index;
namespace ck {
namespace test {
@@ -56,8 +59,13 @@ class TestGemmUniversal_Streamk : public testing::Test
const int StrideB,
const int StrideC)
{
for(auto streamk_sel : streamk_sel_list)
for(size_t i = 0; i < streamk_sel_list.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto streamk_sel = streamk_sel_list[i];
RunSingle(M, N, K, StrideA, StrideB, StrideC, streamk_sel, -1);
}
}
@@ -93,7 +101,8 @@ class TestGemmUniversal_Streamk : public testing::Test
streamk_sel,
Grid_size,
n_warmup,
n_iter);
n_iter,
instance_index);
EXPECT_TRUE(pass);
}
};

View File

@@ -7,6 +7,9 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_universal_streamk_util.hpp"
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
using BF16 = ck::bhalf_t;
using F32 = float;
@@ -83,3 +86,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_KM_KN, KernelTypes_KM_KN);
TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_KM_NK, KernelTypes_KM_NK);
#include "test_gemm_universal_streamk_ut_cases_bf16.inc"
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -7,6 +7,9 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_universal_streamk_util.hpp"
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
using F8 = ck::f8_t;
using F16 = ck::half_t;
@@ -82,3 +85,20 @@ TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP16_MK_KN, KernelTypes_MK_KN);
TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP16_MK_NK, KernelTypes_MK_NK);
#include "test_gemm_universal_streamk_ut_cases_fp16.inc"
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -7,6 +7,9 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_gemm_universal_streamk_util.hpp"
ck::index_t param_mask = 0xffff;
ck::index_t instance_index = -1;
using F8 = ck::f8_t;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
@@ -72,3 +75,19 @@ TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP8_MK_KN, KernelTypes_MK_KN);
TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP8_MK_NK, KernelTypes_MK_NK);
#include "test_gemm_universal_streamk_ut_cases_fp8.inc"
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -11,6 +11,9 @@
#include "profiler/profile_grouped_conv_bwd_data_impl.hpp"
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestGroupedConvndBwdDataWmma : public ::testing::Test
{
@@ -27,20 +30,27 @@ class TestGroupedConvndBwdDataWmma : public ::testing::Test
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
for(size_t i = 0; i < conv_params.size(); i++)
{
pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
OutLayout,
WeiLayout,
InLayout,
DataType,
DataType,
DataType>(
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
OutLayout,
WeiLayout,
InLayout,
DataType,
DataType,
DataType>(
true, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
param,
1, // splitK
instance_index);
}
EXPECT_TRUE(pass);
}
@@ -106,3 +116,20 @@ TYPED_TEST(TestGroupedConvndBwdDataWmma3d, Test3D)
{3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -11,6 +11,9 @@
#include "profiler/profile_grouped_conv_bwd_data_impl.hpp"
static ck::index_t param_mask = 0xffffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestGroupedConvndBwdDataXdl : public ::testing::Test
{
@@ -30,21 +33,27 @@ class TestGroupedConvndBwdDataXdl : public ::testing::Test
bool pass = true;
for(auto split_k : split_ks)
{
for(auto& param : conv_params)
for(size_t i = 0; i < conv_params.size(); i++)
{
pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
OutLayout,
WeiLayout,
InLayout,
DataType,
DataType,
DataType>(
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
OutLayout,
WeiLayout,
InLayout,
DataType,
DataType,
DataType>(
true, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel
param,
split_k);
split_k,
instance_index);
}
}
EXPECT_TRUE(pass);
@@ -149,3 +158,19 @@ TYPED_TEST(TestGroupedConvndBwdDataXdl3d, Test3D)
{3, 1, 1, 1, 1, {3, 3, 3}, {4, 16, 16}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -23,6 +23,8 @@
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestGroupedConvndBwdWeight : public ::testing::Test
{
@@ -83,7 +85,8 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
}
bool PerformConvWeightBilinear(ck::utils::conv::ConvParam& conv_param,
const ck::index_t split_k)
const ck::index_t split_k,
ck::index_t instance_index_ = -1)
{
bool passed = true;
@@ -163,6 +166,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
int num_kernel = 0;
for(std::size_t i = 0; i < op_ptrs.size(); ++i)
{
@@ -197,6 +201,12 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
if((instance_index_ != -1) && (instance_index_ + 1 != num_kernel))
{
// when instance_index is specified, skip every instance except the selected one
continue;
}
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr});
wei_device_buf.FromDevice(wei_device.mData.data());
passed &= ck::utils::check_err(wei_device, wei_host, "Error: incorrect results!");
@@ -218,6 +228,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(instance_index_ != -1)
{
std::cout << "grouped_conv_bwd_weight_instance (" << instance_index_ << "/" << num_kernel
<< "): " << (passed ? "Passed" : "Failed") << std::endl;
}
return passed;
}
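
Because num_kernel is incremented before the comparison, the condition instance_index_ + 1 != num_kernel skips every supported kernel except the one whose zero-based supported-ordinal equals instance_index_. A standalone check of that arithmetic (the support pattern is made up):

#include <iostream>

int main()
{
    const int instance_index_ = 2;
    int num_kernel = 0;
    for(int k = 0; k < 5; k++)
    {
        const bool supported = (k != 1); // pretend kernel 1 is unsupported
        if(!supported)
            continue;
        ++num_kernel;
        if(instance_index_ != -1 && instance_index_ + 1 != num_kernel)
            continue; // not the selected ordinal
        std::cout << "kernel " << k << " runs (ordinal " << num_kernel - 1
                  << ")" << std::endl; // prints: kernel 3 runs (ordinal 2)
    }
    return 0;
}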
@@ -228,9 +243,14 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
for(auto split_k : split_ks)
{
for(auto& param : conv_params)
for(size_t i = 0; i < conv_params.size(); i++)
{
pass = pass && PerformConvWeightBilinear(param, split_k);
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
pass = pass && PerformConvWeightBilinear(param, split_k, instance_index);
}
}
EXPECT_TRUE(pass);
@@ -268,3 +288,20 @@ TYPED_TEST(TestGroupedConvndBwdWeight3d, Test3D)
{3, 1, 1, 4, 4, {3, 3, 3}, {14, 28, 28}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->Run();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -15,6 +15,9 @@
#include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
using namespace ck::tensor_layout::convolution;
template <typename Tuple>
@@ -92,8 +95,13 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
for(auto split_k : split_ks)
{
for(auto& param : conv_params)
for(size_t i = 0; i < conv_params.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
if(!skip_case(split_k))
{
pass = pass && ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial{},
@@ -108,7 +116,8 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
false, // do_log
false, // time_kernel
param,
std::to_string(split_k));
std::to_string(split_k),
instance_index);
}
}
}
@@ -224,3 +233,20 @@ TYPED_TEST(TestGroupedConvndBwdWeight3d, Test3D)
{3, 16, 16, 1, 1, {3, 3, 3}, {28, 28, 28}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->Run();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -9,6 +9,9 @@
#include "profiler/profile_grouped_conv_fwd_impl.hpp"
static ck::index_t param_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
{
@@ -26,23 +29,30 @@ class TestGroupedConvndFwd : public ::testing::Test
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
for(size_t i = 0; i < conv_params.size(); i++)
{
pass = pass && ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
DataType,
DataType,
DataType,
DataType,
DataType,
IndexType>(
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
pass = pass && ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
DataType,
DataType,
DataType,
DataType,
DataType,
IndexType>(
true, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
param,
ck::tensor_operation::element_wise::PassThrough{},
instance_index);
}
EXPECT_TRUE(pass);
}
@@ -148,3 +158,20 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
{3, 96, 1, 1, 1, {3, 3, 3}, {4, 30, 160}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -11,7 +11,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
static ck::index_t param_mask = 0xffffff;
static ck::index_t instance_index = -1;
using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
@@ -30,8 +32,13 @@ class TestGroupedConvndFwd : public ::testing::Test
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
for(size_t i = 0; i < conv_params.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
pass = pass && ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
InLayout,
WeiLayout,
@@ -47,7 +54,8 @@ class TestGroupedConvndFwd : public ::testing::Test
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
param,
instance_index);
}
EXPECT_TRUE(pass);
}
@@ -95,3 +103,19 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
{3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -10,8 +10,9 @@
#include "profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using AddClamp = ck::tensor_operation::element_wise::AddClamp;
static ck::index_t param_mask = 0xffffff;
static ck::index_t instance_index = -1;
using AddClamp = ck::tensor_operation::element_wise::AddClamp;
template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
@@ -30,8 +31,13 @@ class TestGroupedConvndFwd : public ::testing::Test
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
for(size_t i = 0; i < conv_params.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
pass = pass && ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
InLayout,
WeiLayout,
@@ -47,7 +53,8 @@ class TestGroupedConvndFwd : public ::testing::Test
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
param,
instance_index);
}
EXPECT_TRUE(pass);
}
@@ -95,3 +102,19 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
{3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -10,8 +10,9 @@
#include "profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using AddClamp = ck::tensor_operation::element_wise::AddClamp;
static ck::index_t param_mask = 0xffffff;
static ck::index_t instance_index = -1;
using AddClamp = ck::tensor_operation::element_wise::AddClamp;
template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
@@ -30,8 +31,13 @@ class TestGroupedConvndFwd : public ::testing::Test
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
for(size_t i = 0; i < conv_params.size(); i++)
{
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
pass = pass && ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
InLayout,
WeiLayout,
@@ -47,7 +53,8 @@ class TestGroupedConvndFwd : public ::testing::Test
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
param,
instance_index);
}
EXPECT_TRUE(pass);
}
@@ -133,3 +140,19 @@ TYPED_TEST(TestGroupedConvndFwdBiasClamp3d, Test3D)
{1, 1, 1}});
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -10,8 +10,9 @@
#include "profiler/profile_grouped_conv_fwd_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using Clamp = ck::tensor_operation::element_wise::Clamp;
static ck::index_t param_mask = 0xffffff;
static ck::index_t instance_index = -1;
using Clamp = ck::tensor_operation::element_wise::Clamp;
template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
@@ -31,25 +32,31 @@ class TestGroupedConvndFwd : public ::testing::Test
EXPECT_FALSE(conv_params.empty());
bool pass = true;
Clamp out_element_op{0.f, 256.f};
for(auto& param : conv_params)
for(size_t i = 0; i < conv_params.size(); i++)
{
pass = pass && ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
DataType,
DataType,
DataType,
DataType,
DataType,
IndexType,
Clamp>(
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
pass = pass && ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
DataType,
DataType,
DataType,
DataType,
DataType,
IndexType,
Clamp>(
true, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel
param,
out_element_op);
out_element_op,
instance_index);
}
EXPECT_TRUE(pass);
}
@@ -97,3 +104,19 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
{3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -10,8 +10,9 @@
#include "profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
static ck::index_t param_mask = 0xffffff;
static ck::index_t instance_index = -1;
using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
@@ -30,9 +31,14 @@ class TestGroupedConvndFwd : public ::testing::Test
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
for(size_t i = 0; i < conv_params.size(); i++)
{
pass = pass &&
if((param_mask & (1 << i)) == 0)
{
continue;
}
auto& param = conv_params[i];
pass = pass &&
ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
InLayout,
WeiLayout,
@@ -48,7 +54,8 @@ class TestGroupedConvndFwd : public ::testing::Test
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
param,
instance_index);
}
EXPECT_TRUE(pass);
}
@@ -96,3 +103,19 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
{3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -10,8 +10,9 @@
#include "profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using AddClamp = ck::tensor_operation::element_wise::AddClamp;
static ck::index_t param_mask = 0xffffff;
static ck::index_t instance_index = -1;
using AddClamp = ck::tensor_operation::element_wise::AddClamp;
template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
@@ -47,7 +48,8 @@ class TestGroupedConvndFwd : public ::testing::Test
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
param,
instance_index);
}
EXPECT_TRUE(pass);
}
@@ -95,3 +97,19 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
{3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->template Run<3>();
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}
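
Note: this main() is copy-pasted into each test binary in this PR. A hypothetical shared helper (the name parse_partial_test_args and the long& parameters are mine, not in the commit; the tests themselves use ck::index_t) could centralize the parsing after InitGoogleTest has consumed gtest's own flags:

#include <cstdlib>
#include <iostream>

// Hypothetical helper: parse "<mask> <instance_index>" left over after
// testing::InitGoogleTest(&argc, argv) has removed the gtest flags.
inline void parse_partial_test_args(int argc, char** argv, long& mask, long& instance_index)
{
    if(argc == 3)
    {
        mask           = std::strtol(argv[1], nullptr, 0);
        instance_index = std::atoi(argv[2]);
    }
    else if(argc != 1)
    {
        std::cout << "Usage of " << argv[0] << std::endl;
        std::cout << "Arg1,2: mask instance_index(-1 means all)" << std::endl;
    }
}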

View File

@@ -10,6 +10,9 @@
#include "gtest/gtest.h"
#include "test_grouped_gemm_util.hpp"
ck::index_t param_mask = 0xffffff;
ck::index_t instance_index = -1;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F8 = ck::f8_t;
@@ -42,3 +45,19 @@ using KernelTypes = ::testing::Types<
TYPED_TEST_SUITE(TestGroupedGemm, KernelTypes);
#include "test_grouped_gemm_ut_cases.inc"
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
param_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -23,6 +23,9 @@
#include "ck/utility/number.hpp"
#include "profiler/profile_grouped_gemm_impl.hpp"
extern ck::index_t param_mask;
extern ck::index_t instance_index;
namespace ck {
namespace test {
@@ -109,8 +112,16 @@ class TestGroupedGemm : public testing::Test
{
SetStrides<ELayout>(stride_cs, Ms, Ns);
}
std::vector<int> k_batches;
for(size_t i = 0; i < k_batches_.size(); i++)
{
if(param_mask & (1 << i))
{
k_batches.push_back(k_batches_[i]);
}
}
RunSingle(Ms, Ns, Ks, stride_as, stride_bs, stride_cs, k_batches_);
RunSingle(Ms, Ns, Ks, stride_as, stride_bs, stride_cs, k_batches);
}
void RunSingle(const std::vector<int>& Ms,
@@ -139,7 +150,8 @@ class TestGroupedGemm : public testing::Test
StrideCs,
kbatches,
n_warmup_,
n_iter_);
n_iter_,
instance_index);
EXPECT_TRUE(pass);
}
};
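
Note: unlike the conv tests, the mask here prunes the k_batch sweep and builds a filtered copy of the vector rather than skipping in place. The same idea as a generic helper (hypothetical, not part of the commit):

#include <cstddef>
#include <cstdint>
#include <vector>

// Keep element i of v iff bit i of mask is set.
template <typename T>
std::vector<T> filter_by_mask(const std::vector<T>& v, std::int64_t mask)
{
    std::vector<T> out;
    for(std::size_t i = 0; i < v.size(); ++i)
        if(mask & (std::int64_t{1} << i))
            out.push_back(v[i]);
    return out;
}

// e.g. filter_by_mask(std::vector<int>{1, 2, 4, 8}, 0x5) yields {1, 4}.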

View File

@@ -56,10 +56,27 @@ __host__ void cpu_magic_number_division(uint32_t magic_multiplier,
}
}
int main(int, char*[])
int main(int argc, char* argv[])
{
uint64_t num_divisor = 4096;
uint64_t num_dividend = 1L << 16;
uint64_t num_divisor = 4096;
uint64_t num_dividend = 1L << 16;
uint32_t divisor_start = 0;
uint32_t divisor_end = num_divisor;
if(argc == 1)
{
// use default range
}
else if(argc == 3)
{
divisor_start = std::stoi(argv[1]);
divisor_end = std::stoi(argv[2]);
}
else
{
std::cerr << "arg1 to 2: divisor_start divisor_end" << std::endl;
return 1;
}
std::vector<int32_t> divisors_host(num_divisor);
std::vector<int32_t> dividends_host(num_dividend);
@@ -90,6 +107,10 @@ int main(int, char*[])
for(std::size_t i = 0; i < num_divisor; ++i)
{
if(i < divisor_start || i > divisor_end)
{
continue;
}
// run naive division on GPU
gpu_naive_division<<<1024, 256>>>(
divisors_host[i],
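
Note: for context, this test checks magic-number division, where dividing by a runtime-constant d is replaced by a multiply-high, an add, and a shift. One common construction (a host-side sketch of the usual scheme; not necessarily CK's exact formula) with a reduced spot-check of the divisor range:

#include <cassert>
#include <cstdint>

struct Magic { uint32_t multiplier; uint32_t shift; };

// s = ceil(log2(d)); multiplier = 2^32 * (2^s - d) / d + 1; then
// q = (umulhi(n, multiplier) + n) >> s, valid for n up to INT32_MAX
// (hi < n there, so hi + n cannot wrap 32 bits).
Magic make_magic(uint32_t d)
{
    uint32_t s = 0;
    while((1u << s) < d)
        ++s;
    uint64_t mul = ((uint64_t{1} << 32) * ((uint64_t{1} << s) - d)) / d + 1;
    return {static_cast<uint32_t>(mul), s};
}

uint32_t magic_div(uint32_t n, Magic m)
{
    uint32_t hi = static_cast<uint32_t>((uint64_t{n} * m.multiplier) >> 32); // umulhi
    return (hi + n) >> m.shift;
}

int main()
{
    for(uint32_t d = 1; d <= 64; ++d) // small range; the real test sweeps 4096 divisors
    {
        Magic m = make_magic(d);
        for(uint32_t n = 0; n < (1u << 16); ++n)
            assert(magic_div(n, m) == n / d);
    }
    return 0;
}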

View File

@@ -8,6 +8,9 @@ using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
static ck::index_t length_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestgroupnormBwdData : public ::testing::Test
{
@@ -29,15 +32,20 @@ class TestgroupnormBwdData : public ::testing::Test
{1, 32, 32, 32, 20},
{1, 16, 16, 32, 40}};
for(auto length : lengths)
for(size_t i = 0; i < lengths.size(); i++)
{
if((length_mask & (1 << i)) == 0)
{
continue;
}
auto length = lengths[i];
bool success = ck::profiler::profile_groupnorm_bwd_data_impl<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
ComputeDataType,
DXDataType>(
true, 2, false, false, length);
true, 2, false, false, length, instance_index);
EXPECT_TRUE(success);
}
}
@@ -49,3 +57,19 @@ using KernelTypes = ::testing::Types<
TYPED_TEST_SUITE(TestgroupnormBwdData, KernelTypes);
TYPED_TEST(TestgroupnormBwdData, Test_FP32) { this->Run(); }
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
length_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}
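
Note: instance_index is forwarded into the profiler implementations, which loop over the instance library. The intended gating, schematically (my sketch; the real profile_*_impl signatures and instance types differ):

#include <cstdio>
#include <vector>

struct Instance { const char* name; }; // stand-in for a CK device-op instance

void run_selected(const std::vector<Instance>& instances, int instance_index)
{
    for(int i = 0; i < static_cast<int>(instances.size()); ++i)
    {
        // -1 keeps the old behavior (run every instance); otherwise run one.
        if(instance_index >= 0 && i != instance_index)
            continue;
        std::printf("running instance %d: %s\n", i, instances[i].name);
    }
}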

View File

@@ -8,6 +8,9 @@ using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
static ck::index_t length_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestLayernorm2dBwdData : public ::testing::Test
{
@@ -25,16 +28,21 @@ class TestLayernorm2dBwdData : public ::testing::Test
std::vector<std::vector<ck::index_t>> lengths = {
{4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
for(auto length : lengths)
for(size_t i = 0; i < lengths.size(); i++)
{
bool success =
ck::profiler::profile_layernorm_bwd_data_impl<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
ComputeDataType,
DXDataType,
2>(true, 2, false, false, length);
if((length_mask & (1 << i)) == 0)
{
continue;
}
auto length = lengths[i];
bool success = ck::profiler::profile_layernorm_bwd_data_impl<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
ComputeDataType,
DXDataType,
2>(
true, 2, false, false, length, instance_index);
EXPECT_TRUE(success);
}
}
@@ -46,3 +54,20 @@ using KernelTypes = ::testing::Types<
TYPED_TEST_SUITE(TestLayernorm2dBwdData, KernelTypes);
TYPED_TEST(TestLayernorm2dBwdData, Test_FP32) { this->Run(); }
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
length_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -8,6 +8,9 @@ using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
static ck::index_t length_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestLayernorm2dBwdGammaBeta : public ::testing::Test
{
@@ -25,8 +28,13 @@ class TestLayernorm2dBwdGammaBeta : public ::testing::Test
std::vector<std::vector<ck::index_t>> lengths = {
{4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
for(auto length : lengths)
for(size_t i = 0; i < lengths.size(); i++)
{
if((length_mask & (1 << i)) == 0)
{
continue;
}
auto length = lengths[i];
bool success = ck::profiler::profile_layernorm_bwd_gamma_beta_impl<DYDataType,
XDataType,
MeanInvStdDataType,
@@ -34,7 +42,7 @@ class TestLayernorm2dBwdGammaBeta : public ::testing::Test
DGammaDataType,
DBetaDataType,
2>(
true, 2, false, false, length);
true, 2, false, false, length, instance_index);
EXPECT_TRUE(success);
}
}
@@ -46,3 +54,20 @@ using KernelTypes = ::testing::Types<
TYPED_TEST_SUITE(TestLayernorm2dBwdGammaBeta, KernelTypes);
TYPED_TEST(TestLayernorm2dBwdGammaBeta, Test_FP32) { this->Run(); }
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
length_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}
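
Note: one caveat about this recurring pattern: (1 << i) is an int shift, well defined only for i < 31. With the 16-bit default mask (0xffff) and the short case lists here that is fine, but a longer list would want a 64-bit-safe test, e.g. (hypothetical variant, not in the commit):

#include <cstddef>
#include <cstdint>

inline bool case_enabled(std::uint64_t mask, std::size_t i)
{
    return i < 64 && (mask & (std::uint64_t{1} << i)) != 0;
}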

View File

@@ -8,6 +8,9 @@ using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
static ck::index_t length_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestGroupnorm : public ::testing::Test
{
@@ -31,16 +34,21 @@ class TestGroupnorm : public ::testing::Test
{2, 32, 32, 32, 40},
{1, 16, 16, 32, 40}};
for(auto length : lengths)
for(size_t i = 0; i < lengths.size(); i++)
{
bool success =
ck::profiler::profile_groupnorm_impl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
SaveMeanInvStdDataType,
true>(true, 2, false, false, length);
if((length_mask & (1 << i)) == 0)
{
continue;
}
auto length = lengths[i];
bool success = ck::profiler::profile_groupnorm_impl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
SaveMeanInvStdDataType,
true>(
true, 2, false, false, length, instance_index);
EXPECT_TRUE(success);
}
}
@@ -52,3 +60,20 @@ using KernelTypes = ::testing::Types<
TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
TYPED_TEST(TestGroupnorm, Test_FP16) { this->Run(); }
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
length_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -8,6 +8,9 @@ using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
static ck::index_t length_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestGroupnorm : public ::testing::Test
{
@@ -29,16 +32,21 @@ class TestGroupnorm : public ::testing::Test
{1, 32, 32, 32, 20},
{1, 16, 16, 32, 40}};
for(auto length : lengths)
for(size_t i = 0; i < lengths.size(); i++)
{
bool success =
ck::profiler::profile_groupnorm_impl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
SaveMeanInvStdDataType,
true>(true, 2, false, false, length);
if((length_mask & (1 << i)) == 0)
{
continue;
}
auto length = lengths[i];
bool success = ck::profiler::profile_groupnorm_impl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
SaveMeanInvStdDataType,
true>(
true, 2, false, false, length, instance_index);
EXPECT_TRUE(success);
}
}
@@ -50,3 +58,20 @@ using KernelTypes = ::testing::Types<
TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
TYPED_TEST(TestGroupnorm, Test_FP32) { this->Run(); }
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
length_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -8,6 +8,9 @@ using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
static ck::index_t length_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestLayernorm2d : public ::testing::Test
{
@@ -25,8 +28,13 @@ class TestLayernorm2d : public ::testing::Test
std::vector<std::vector<ck::index_t>> lengths = {
{4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
for(auto length : lengths)
for(size_t i = 0; i < lengths.size(); i++)
{
if((length_mask & (1 << i)) == 0)
{
continue;
}
auto length = lengths[i];
bool success = ck::profiler::profile_layernorm_impl<XDataType,
GammaDataType,
BetaDataType,
@@ -34,7 +42,8 @@ class TestLayernorm2d : public ::testing::Test
YDataType,
SaveMeanInvStdDataType,
true,
2>(true, 2, false, false, length);
2>(
true, 2, false, false, length, instance_index);
EXPECT_TRUE(success);
}
}
@@ -46,3 +55,19 @@ using KernelTypes = ::testing::Types<
TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);
TYPED_TEST(TestLayernorm2d, Test_FP16) { this->Run(); }
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
length_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

View File

@@ -8,6 +8,9 @@ using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
static ck::index_t length_mask = 0xffff;
static ck::index_t instance_index = -1;
template <typename Tuple>
class TestLayernorm2d : public ::testing::Test
{
@@ -25,8 +28,13 @@ class TestLayernorm2d : public ::testing::Test
std::vector<std::vector<ck::index_t>> lengths = {
{4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
for(auto length : lengths)
for(size_t i = 0; i < lengths.size(); i++)
{
if((length_mask & (1 << i)) == 0)
{
continue;
}
auto length = lengths[i];
bool success = ck::profiler::profile_layernorm_impl<XDataType,
GammaDataType,
BetaDataType,
@@ -34,7 +42,8 @@ class TestLayernorm2d : public ::testing::Test
YDataType,
SaveMeanInvStdDataType,
true,
2>(true, 2, false, false, length);
2>(
true, 2, false, false, length, instance_index);
EXPECT_TRUE(success);
}
}
@@ -46,3 +55,19 @@ using KernelTypes = ::testing::Types<
TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes);
TYPED_TEST(TestLayernorm2d, Test_FP32) { this->Run(); }
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
if(argc == 1) {}
else if(argc == 3)
{
length_mask = strtol(argv[1], nullptr, 0);
instance_index = atoi(argv[2]);
}
else
{
std::cout << "Usage of " << argv[0] << std::endl;
std::cout << "Arg1,2: length_mask instance_index(-1 means all)" << std::endl;
}
return RUN_ALL_TESTS();
}

Some files were not shown because too many files have changed in this diff.