From ed41349e19d7e51dde76d8289daeb1d66dbb7cb7 Mon Sep 17 00:00:00 2001
From: "assistant-librarian[bot]"
 <assistant-librarian[bot]@users.noreply.github.com>
Date: Thu, 11 Sep 2025 20:11:33 +0000
Subject: [PATCH] Merge commit 'ffe9775e7071d524caa70cf13017ebe80fe6272b' into
 develop

---
 .pre-commit-config.yaml                       | 12 ++---
 .../run_grouped_gemm_example.inc              | 45 ++++++++--------
 ...bias_relu_add_layernorm_xdl_naive_fp16.cpp | 52 +++++++++++++------
 .../gemm_layernorm_xdl_naive_fp16.cpp         | 48 +++++++++++------
 .../grouped_gemm_bias_e_permute_xdl_fp16.cpp  | 12 +++--
 example/39_permute/permute_1xHxW_fp16.cpp     | 21 +++++++-
 example/39_permute/permute_HxWx4_fp16.cpp     | 21 +++++++-
 example/39_permute/permute_NxHxW_fp16.cpp     | 21 +++++++-
 .../39_permute/run_permute_bundle_example.inc | 10 ++--
 .../run_permute_element_example.inc           | 10 ++--
 ...bias_relu_perchannel_quantization_int8.cpp | 24 ++++++++-
 ...l_bias_relu_perlayer_quantization_int8.cpp | 23 +++++++-
 ...bias_tanh_perchannel_quantization_int8.cpp | 24 ++++++++-
 ...l_bias_tanh_perlayer_quantization_int8.cpp | 23 +++++++-
 ...2d_fwd_dl_perchannel_quantization_int8.cpp | 23 +++++++-
 ...nv2d_fwd_dl_perlayer_quantization_int8.cpp | 23 +++++++-
 ...bias_relu_perchannel_quantization_int8.cpp | 24 ++++++++-
 ...l_bias_relu_perlayer_quantization_int8.cpp | 23 +++++++-
 ...d_fwd_xdl_perchannel_quantization_int8.cpp | 23 +++++++-
 ...v2d_fwd_xdl_perlayer_quantization_int8.cpp | 23 +++++++-
 ...d_bias_perchannel_quantization_example.inc |  6 +--
 ...fwd_bias_perlayer_quantization_example.inc |  6 +--
 ...2d_fwd_perchannel_quantization_example.inc |  6 +--
 ...nv2d_fwd_perlayer_quantization_example.inc |  6 +--
 .../run_groupnorm_fwd_example.inc             | 33 +++++++++---
 .../elementwise_binary_4D_fp16.cpp            | 18 ++++++-
 .../elementwise_permute_4D_fp16.cpp           | 18 ++++++-
 .../elementwise_permute_4D_fp16_col.cpp       | 18 ++++++-
 .../elementwise_permute_4D_fp16_row.cpp       | 18 ++++++-
 .../elementwise_permute_4D_fp32_col.cpp       | 18 ++++++-
 .../elementwise_permute_4D_fp32_row.cpp       | 18 ++++++-
 ...entwise_scale_permute_amax_2D_fp16_fp8.cpp | 16 ++++++
 .../elementwise_trinary_4D_fp16.cpp           | 18 ++++++-
 .../elementwise_layernorm_blockwise.cpp       | 22 +++++++-
 example/ck_tile/15_fused_moe/main.cpp         |  3 ++
 example/ck_tile/39_copy/test_tile_example.sh  |  3 ++
 .../gemm_pipeline_ag_bg_cr_comp_v5.hpp        |  3 ++
 script/dependency-parser/main.py              |  3 ++
 .../src/enhanced_ninja_parser.py              |  3 ++
 .../src/selective_test_filter.py              |  3 ++
 script/gemm_profile.sh                        |  2 +
 script/launch_tests.sh                        |  2 +
 script/ninja_json_converter.py                |  2 +
 script/remod_for_ck_tile.sh                   |  2 +
 script/remove_exec_bit.sh                     |  2 +
 .../gemm/test_gemm_pipeline_persistent.cpp    |  3 ++
 .../test_gemm_multi_d_ut_cases_cshuffle.inc   |  3 ++
 .../test_gemm_pipeline_wp.cpp                 |  3 ++
 test_data/generate_model_configs.py           |  3 ++
 test_data/generate_test_dataset.sh            |  3 ++
 test_data/miopen_to_csv.py                    |  3 ++
 test_data/run_model_with_miopen.py            |  3 ++
 52 files changed, 633 insertions(+), 122 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 664c5219e2..2d936d3a48 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,12 +6,12 @@ repos:
         entry: clang-format-18 -i --style=file
         language: system
         types_or: [c++, inc]
-    -   id: copyright-year-checker
-        name: copyright-year-checker
-        entry: script/check_copyright_year.sh
-        verbose: false
-        language: script
-        types: [c++]
+    # -   id: copyright-year-checker
+    #     name: copyright-year-checker
+    #     entry: script/check_copyright_year.sh
+    #     verbose: false
+    #     language: script
+    #     types: [c++]
     -   id: remove-exec-bit
         name: Remove executable bit from non-executable files
         entry: script/remove_exec_bit.sh
diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc
index 7186c22233..4ef6074f4a 100644
--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -278,6 +278,30 @@ bool run_grouped_gemm_example(int argc, char* argv[])
 
     problem_size.group_count = 16;
 
+    if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 6)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        config.async_hargs       = std::stoi(argv[4]);
+        problem_size.group_count = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: async hargs (0=n0, 1=yes)\n");
+        printf("arg5: group count (default=16)");
+        exit(0);
+    }
+
     for(int i = 0; i < problem_size.group_count; i++)
     {
         problem_size.Ms.push_back(256 + 256 * i);
@@ -288,27 +312,6 @@ bool run_grouped_gemm_example(int argc, char* argv[])
         problem_size.stride_Bs.push_back(problem_size.Ks[i]);
         problem_size.stride_Cs.push_back(problem_size.Ns[i]);
     }
-    if(argc == 4)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-    }
-    else if(argc == 5)
-    {
-        config.do_verification = std::stoi(argv[1]);
-        config.init_method     = std::stoi(argv[2]);
-        config.time_kernel     = std::stoi(argv[3]);
-        config.async_hargs     = std::stoi(argv[4]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4: async hargs (0=n0, 1=yes)\n");
-        exit(0);
-    }
 
     return run_grouped_gemm(problem_size, config);
 }
diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
index 5dccb11bba..71a6f7f3d1 100644
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
@@ -236,7 +236,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
               << " GB/s, " << std::endl;
 }
 
-int main()
+int main(int argc, char* argv[])
 {
     // GEMM shape
     ck::index_t M = 1024;
@@ -249,6 +249,25 @@ int main()
     ck::index_t StrideD1 = 1024;
     ck::index_t StrideE  = 1024;
 
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = static_cast<bool>(std::stoi(argv[2]));
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
     Tensor<D0DataType> bias_n(f_host_tensor_descriptor1d(N, 1));
@@ -357,6 +376,7 @@ int main()
     normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
 
     bool pass = true;
+    if(do_verification)
     {
         // verification
         Tensor<LayerNormOutDataType> host_layerNorm_m_n(
@@ -383,27 +403,25 @@ int main()
                                      1e-2);
     }
 
+    if(time_kernel)
     {
         // evaluate kernel perf
-        bool time_kernel = true;
-
         float gemm_reduce_mean_reduce_square_mean_ave_time =
-            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
+            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true});
         float normalize_ave_time =
-            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true});
 
-        if(time_kernel)
-            DumpGemmLayerNormPerf<ADataType,
-                                  BDataType,
-                                  EDataType,
-                                  D0DataType,
-                                  D1DataType,
-                                  R0DataType,
-                                  R1DataType,
-                                  GammaDataType,
-                                  BetaDataType,
-                                  LayerNormOutDataType>(
-                gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
+        DumpGemmLayerNormPerf<ADataType,
+                              BDataType,
+                              EDataType,
+                              D0DataType,
+                              D1DataType,
+                              R0DataType,
+                              R1DataType,
+                              GammaDataType,
+                              BetaDataType,
+                              LayerNormOutDataType>(
+            gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
     }
 
     return pass ? 0 : 1;
diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
index 168193ad5b..833cb48358 100644
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
@@ -221,7 +221,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
               << " GB/s, " << std::endl;
 }
 
-int main()
+int main(int argc, char* argv[])
 {
     // GEMM shape
     ck::index_t M = 1024;
@@ -232,6 +232,25 @@ int main()
     ck::index_t StrideB = 1024;
     ck::index_t StrideE = 1024;
 
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = static_cast<bool>(std::stoi(argv[2]));
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
     Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
@@ -333,6 +352,7 @@ int main()
     normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
 
     bool pass = true;
+    if(do_verification)
     {
         // verification
         Tensor<LayerNormOutDataType> host_layerNorm_m_n(
@@ -354,25 +374,23 @@ int main()
             layerNorm_m_n, host_layerNorm_m_n, "Error: Incorrect results d1", 1e-3, 1e-3);
     }
 
+    if(time_kernel)
     {
         // evaluate kernel perf
-        bool time_kernel = true;
-
         float gemm_reduce_mean_reduce_square_mean_ave_time =
-            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
+            gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true});
         float normalize_ave_time =
-            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+            normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true});
 
-        if(time_kernel)
-            DumpGemmLayerNormPerf<ADataType,
-                                  BDataType,
-                                  EDataType,
-                                  R0DataType,
-                                  R1DataType,
-                                  GammaDataType,
-                                  BetaDataType,
-                                  LayerNormOutDataType>(
-                gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
+        DumpGemmLayerNormPerf<ADataType,
+                              BDataType,
+                              EDataType,
+                              R0DataType,
+                              R1DataType,
+                              GammaDataType,
+                              BetaDataType,
+                              LayerNormOutDataType>(
+            gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
     }
 
     return pass ? 0 : 1;
diff --git a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
index 24e9b1d9b7..37ae349f44 100644
--- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
@@ -194,22 +194,28 @@ int main(int argc, char* argv[])
     int init_method      = 1;
     bool time_kernel     = false;
 
-    if(argc == 4)
+    std::size_t group_count = rand() % 16 + 1;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 5)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
         time_kernel     = std::stoi(argv[3]);
+        group_count     = std::stoi(argv[4]);
     }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("arg4: group count (default = random from 1..16)");
         exit(0);
     }
 
-    std::size_t group_count = rand() % 16 + 1;
-
     // GEMM shape
     std::vector<ck::tensor_operation::device::ContractionDesc<1>> contraction_descs;
     std::vector<const void*> p_a, p_b;
diff --git a/example/39_permute/permute_1xHxW_fp16.cpp b/example/39_permute/permute_1xHxW_fp16.cpp
index 7336c3b631..30cf4ef083 100644
--- a/example/39_permute/permute_1xHxW_fp16.cpp
+++ b/example/39_permute/permute_1xHxW_fp16.cpp
@@ -17,4 +17,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
 
 #include "run_permute_element_example.inc"
 
-int main() { return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}); }
+int main(int argc, char* argv[])
+{
+    bool time_kernel = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 2)
+    {
+        time_kernel = std::stoi(argv[1]);
+    }
+    else
+    {
+        printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
+        exit(0);
+    }
+
+    return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}, time_kernel);
+}
diff --git a/example/39_permute/permute_HxWx4_fp16.cpp b/example/39_permute/permute_HxWx4_fp16.cpp
index 6c24919ded..c655384301 100644
--- a/example/39_permute/permute_HxWx4_fp16.cpp
+++ b/example/39_permute/permute_HxWx4_fp16.cpp
@@ -19,4 +19,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
 
 #include "run_permute_bundle_example.inc"
 
-int main() { return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}); }
+int main(int argc, char* argv[])
+{
+    bool time_kernel = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 2)
+    {
+        time_kernel = std::stoi(argv[1]);
+    }
+    else
+    {
+        printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
+        exit(0);
+    }
+
+    return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}, time_kernel);
+}
diff --git a/example/39_permute/permute_NxHxW_fp16.cpp b/example/39_permute/permute_NxHxW_fp16.cpp
index 3551d2a7c8..d3d7f47ced 100644
--- a/example/39_permute/permute_NxHxW_fp16.cpp
+++ b/example/39_permute/permute_NxHxW_fp16.cpp
@@ -17,4 +17,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
 
 #include "run_permute_element_example.inc"
 
-int main() { return !run_permute_element_example({121, 768, 80}, {0, 2, 1}); }
+int main(int argc, char* argv[])
+{
+    bool time_kernel = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 2)
+    {
+        time_kernel = std::stoi(argv[1]);
+    }
+    else
+    {
+        printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
+        exit(0);
+    }
+
+    return !run_permute_element_example({121, 768, 80}, {0, 2, 1}, time_kernel);
+}
diff --git a/example/39_permute/run_permute_bundle_example.inc b/example/39_permute/run_permute_bundle_example.inc
index 2c19872922..fab02f8cf3 100644
--- a/example/39_permute/run_permute_bundle_example.inc
+++ b/example/39_permute/run_permute_bundle_example.inc
@@ -3,7 +3,7 @@
 
 #pragma once
 
-bool run_permute_bundle(const Problem& problem)
+bool run_permute_bundle(const Problem& problem, bool time_kernel)
 {
     const auto& input_bundle_shape = problem.shape;
     const auto& input_bundle_axes  = problem.axes;
@@ -41,7 +41,7 @@ bool run_permute_bundle(const Problem& problem)
     };
 
     auto invoker   = permute.MakeInvoker();
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::cout << "Perf: " << ave_time << " ms" << std::endl;
 
@@ -72,7 +72,9 @@ bool run_permute_bundle(const Problem& problem)
                                 1e-6);
 }
 
-bool run_permute_bundle_example(const Problem::Shape& shape, const Problem::Axes& axes)
+bool run_permute_bundle_example(const Problem::Shape& shape,
+                                const Problem::Axes& axes,
+                                bool time_kernel)
 {
-    return run_permute_bundle(Problem{shape, axes});
+    return run_permute_bundle(Problem{shape, axes}, time_kernel);
 }
diff --git a/example/39_permute/run_permute_element_example.inc b/example/39_permute/run_permute_element_example.inc
index 3587134456..c3f3b972e9 100644
--- a/example/39_permute/run_permute_element_example.inc
+++ b/example/39_permute/run_permute_element_example.inc
@@ -3,7 +3,7 @@
 
 #pragma once
 
-bool run_permute_element(const Problem& problem)
+bool run_permute_element(const Problem& problem, bool time_kernel)
 {
     const auto& input_shape = problem.shape;
     const auto& input_axes  = problem.axes;
@@ -40,7 +40,7 @@ bool run_permute_element(const Problem& problem)
     };
 
     auto invoker   = permute.MakeInvoker();
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::cout << "Perf: " << ave_time << " ms" << std::endl;
 
@@ -59,7 +59,9 @@ bool run_permute_element(const Problem& problem)
                                 1e-6);
 }
 
-bool run_permute_element_example(const Problem::Shape& shape, const Problem::Axes& axes)
+bool run_permute_element_example(const Problem::Shape& shape,
+                                 const Problem::Axes& axes,
+                                 bool time_kernel)
 {
-    return run_permute_element(Problem{shape, axes});
+    return run_permute_element(Problem{shape, axes}, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
index 4573c68658..f9a7d9f638 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
@@ -78,8 +78,28 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     const auto out_element_op = OutElementOp{ActivationOp{}};
-    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perchannel_quantization_example(
+        out_element_op, do_verification, time_kernel);
 };
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
index 005f6263fd..333987edd6 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
@@ -76,9 +76,28 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float requant_scale       = 0.5f;
     const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
-    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
index 62e5e583de..4b94045421 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
@@ -79,9 +79,29 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float scale_z_inv         = 0.5f;
     const auto out_element_op = OutElementOp{scale_z_inv, ActivationOp{}};
-    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perchannel_quantization_example(
+        out_element_op, do_verification, time_kernel);
 };
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
index ef98fe7e4f..b74e06b10a 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
@@ -76,10 +76,29 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float scale_acc           = 0.5f;
     float scale_z_inv         = 0.5f;
     const auto out_element_op = OutElementOp{scale_z_inv, scale_acc, ActivationOp{}};
-    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
index e524ddb2b2..c3ac40a1bc 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp
@@ -76,8 +76,27 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     const auto out_element_op = OutElementOp{ActivationOp{}};
-    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
index d29a3143c0..437fd6f4c2 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp
@@ -71,9 +71,28 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float requant_scale       = 0.5f;
     const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
-    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
index 8c0049b0fa..d2af8926a5 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
@@ -82,8 +82,28 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     const auto out_element_op = OutElementOp{ActivationOp{}};
-    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perchannel_quantization_example(
+        out_element_op, do_verification, time_kernel);
 };
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
index e18c123f7c..81ce2d57e8 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
@@ -80,9 +80,28 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float requant_scale       = 0.5f;
     const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
-    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
index 53f810cc9e..85e3dd9f15 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
@@ -80,8 +80,27 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     const auto out_element_op = OutElementOp{ActivationOp{}};
-    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
index 9db6e201dd..2f333b1198 100644
--- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
@@ -75,9 +75,28 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
 
-int main()
+int main(int argc, char* argv[])
 {
+    bool do_verification = true;
+    bool time_kernel     = false;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     float requant_scale       = 0.5f;
     const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
-    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
 }
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
index e5b924ad51..30e0791ebf 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc
@@ -167,10 +167,10 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op)
+int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op,
+                                                        bool do_verification,
+                                                        bool time_kernel)
 {
-    bool do_verification           = true;
-    bool time_kernel               = true;
     const ck::index_t ndim_spatial = 2;
 
     ck::utils::conv::ConvParam conv_param{
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
index 9f3a769dcf..32fd435e00 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc
@@ -155,10 +155,10 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op)
+int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op,
+                                                      bool do_verification,
+                                                      bool time_kernel)
 {
-    bool do_verification           = true;
-    bool time_kernel               = true;
     const ck::index_t ndim_spatial = 2;
 
     ck::utils::conv::ConvParam conv_param{
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
index 9b08fc690d..362d90b4c1 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc
@@ -157,10 +157,10 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op)
+int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op,
+                                                   bool do_verification,
+                                                   bool time_kernel)
 {
-    bool do_verification           = true;
-    bool time_kernel               = true;
     const ck::index_t ndim_spatial = 2;
 
     ck::utils::conv::ConvParam conv_param{
diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
index 267c737e00..eae6e996cc 100644
--- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
+++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc
@@ -139,10 +139,10 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op)
+int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op,
+                                                 bool do_verification,
+                                                 bool time_kernel)
 {
-    bool do_verification           = true;
-    bool time_kernel               = false;
     const ck::index_t ndim_spatial = 2;
 
     ck::utils::conv::ConvParam conv_param{
diff --git a/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
index ab6f317bc6..86e1c8ccc8 100644
--- a/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
+++ b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
@@ -11,21 +11,36 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
     ck::index_t G = 64;
     ck::index_t C = 128;
 
+    bool do_verification = true;
+    bool time_kernel     = true;
+    bool log_kernel      = true;
+
     if(argc == 1)
     {
         // use default case
     }
-    else if(argc == 6)
+    else if(argc == 4)
     {
-        N = std::stoi(argv[1]);
-        H = std::stoi(argv[2]);
-        W = std::stoi(argv[3]);
-        G = std::stoi(argv[4]);
-        C = std::stoi(argv[5]);
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+        log_kernel      = std::stoi(argv[3]);
+    }
+    else if(argc == 9)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+        log_kernel      = std::stoi(argv[3]);
+        N               = std::stoi(argv[4]);
+        H               = std::stoi(argv[5]);
+        W               = std::stoi(argv[6]);
+        G               = std::stoi(argv[7]);
+        C               = std::stoi(argv[8]);
     }
     else
     {
-        std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
+        std::cerr << "arg1 = verify(0=no, 1=yes), arg2 = time kernels(0=no, 1=yes), arg3 = log "
+                     "kernels(0=no, 1=yes), arg4 to 8: N, H, W, G, C"
+                  << std::endl;
 
         return 1;
     }
@@ -94,7 +109,8 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
     device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
 
     auto invoker_ptr = device_instance.MakeInvokerPointer();
-    float ave_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true});
+    float ave_time =
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel, log_kernel});
 
     std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C +
                             sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C +
@@ -106,6 +122,7 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
               << device_instance.GetTypeString() << std::endl;
 
     bool pass = true;
+    if(do_verification)
     {
         Tensor<YDataType> host_y({N, H, W, G, C});
         Tensor<SaveMeanInvStdDataType> host_save_mean(HostTensorDescriptor{N, G});
diff --git a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
index 8819bb65e6..1564bcb006 100644
--- a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
@@ -44,11 +44,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8, 8>,              // InScalarPerVectorSeq
     ck::Sequence<8>>;                // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
     std::array<ck::index_t, 4> ab_lengths;
     std::array<ck::index_t, 4> ab_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
index 3ea1aa4bf8..9e92543252 100644
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
@@ -37,11 +37,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8>,      // InScalarPerVectorSeq
     ck::Sequence<8>>;     // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 128};
 
diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp
index 13c67fce05..88c23b5f40 100644
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp
@@ -41,11 +41,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8>,      // InScalarPerVectorSeq
     ck::Sequence<8>>;     // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 8, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 8};
     std::array<ck::index_t, 4> ab_lengths;
diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp
index 0a0f6fec10..1185b5a3ca 100644
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp
@@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8>,      // InScalarPerVectorSeq
     ck::Sequence<8>>;     // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 128};
 
diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
index fc664186be..28a3dbc44c 100644
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
@@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<1>,      // InScalarPerVectorSeq
     ck::Sequence<1>>;     // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 8, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 8};
     std::array<ck::index_t, 4> ab_lengths;
diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp
index a0c416318a..14d1d96165 100644
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp
@@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8>,      // InScalarPerVectorSeq
     ck::Sequence<8>>;     // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
     std::vector<std::size_t> nhwc = {16, 32, 64, 128};
 
diff --git a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
index c40447e1f9..0619cc7139 100644
--- a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
+++ b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
@@ -119,6 +119,22 @@ int main(int argc, char* argv[])
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     const float scale = 2.f;
 
     ck::index_t M = 1024;
diff --git a/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp
index 050300eed2..2583f1cb5e 100644
--- a/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp
@@ -48,11 +48,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
     ck::Sequence<8, 8, 8>,                      // InScalarPerVectorSeq
     ck::Sequence<8>>;                           // OutScalarPerVectorSeq
 
-int main()
+int main(int argc, char* argv[])
 {
     bool do_verification = true;
     bool time_kernel     = true;
 
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
+
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
     std::array<ck::index_t, 4> ab_lengths;
     std::array<ck::index_t, 4> ab_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
index c02d540983..51006e676b 100644
--- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
+++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
@@ -77,9 +77,26 @@ void host_elementwise2D(HostTensorC& C,
         }
 }
 
-int main()
+int main(int argc, char* argv[])
 {
-    bool time_kernel = true;
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: time kernel (0=no, 1=yes)\n");
+        exit(0);
+    }
 
     ck::index_t M      = 48 * 256;
     ck::index_t N      = 1024;
@@ -157,6 +174,7 @@ int main()
     std::cout << "Time elapase is : " << ela_time << " ms . " << std::endl;
 
     bool pass = true;
+    if(do_verification)
     {
         std::vector<std::size_t> mn = {static_cast<unsigned long>(M),
                                        static_cast<unsigned long>(N)};
diff --git a/example/ck_tile/15_fused_moe/main.cpp b/example/ck_tile/15_fused_moe/main.cpp
index b6cc3b6543..5129b46231 100644
--- a/example/ck_tile/15_fused_moe/main.cpp
+++ b/example/ck_tile/15_fused_moe/main.cpp
@@ -1,3 +1,6 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include <algorithm>
 #include <cstring>
 #include <unordered_set>
diff --git a/example/ck_tile/39_copy/test_tile_example.sh b/example/ck_tile/39_copy/test_tile_example.sh
index fcd8c8e991..416338fac4 100755
--- a/example/ck_tile/39_copy/test_tile_example.sh
+++ b/example/ck_tile/39_copy/test_tile_example.sh
@@ -1,4 +1,7 @@
 #!/usr/bin/env bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 set -euo pipefail
 
 BIN="${BIN:-../../../build/bin/tile_example_copy}"
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
index b05145890f..b83d37a790 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
@@ -1,3 +1,6 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
diff --git a/script/dependency-parser/main.py b/script/dependency-parser/main.py
index b8fd67ac49..5c956bca00 100644
--- a/script/dependency-parser/main.py
+++ b/script/dependency-parser/main.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 Unified CLI for Ninja Dependency Analysis and Selective Testing
 
diff --git a/script/dependency-parser/src/enhanced_ninja_parser.py b/script/dependency-parser/src/enhanced_ninja_parser.py
index 087ab50640..725768a61f 100644
--- a/script/dependency-parser/src/enhanced_ninja_parser.py
+++ b/script/dependency-parser/src/enhanced_ninja_parser.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 Enhanced Ninja Dependency Parser
 
diff --git a/script/dependency-parser/src/selective_test_filter.py b/script/dependency-parser/src/selective_test_filter.py
index f364d60d27..e8698d115d 100644
--- a/script/dependency-parser/src/selective_test_filter.py
+++ b/script/dependency-parser/src/selective_test_filter.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 Selective Test Filter Tool
 
diff --git a/script/gemm_profile.sh b/script/gemm_profile.sh
index 487b90d640..89419ca711 100755
--- a/script/gemm_profile.sh
+++ b/script/gemm_profile.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
 BIN=./bin/tile_example_gemm_weight_preshuffle
 PREC=fp8
diff --git a/script/launch_tests.sh b/script/launch_tests.sh
index 829ac82378..5e71e25478 100755
--- a/script/launch_tests.sh
+++ b/script/launch_tests.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
 # Get the directory where the script is located
 BUILD_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
diff --git a/script/ninja_json_converter.py b/script/ninja_json_converter.py
index 92660dc7b3..7bfb2f867b 100644
--- a/script/ninja_json_converter.py
+++ b/script/ninja_json_converter.py
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
 """
 Converts .ninja_log files into Chrome's about:tracing format.
diff --git a/script/remod_for_ck_tile.sh b/script/remod_for_ck_tile.sh
index 5c7a78d0cc..b017d2e1d6 100755
--- a/script/remod_for_ck_tile.sh
+++ b/script/remod_for_ck_tile.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
 # Get list of staged files
 STAGED_FILES=$(git diff --cached --name-only)
diff --git a/script/remove_exec_bit.sh b/script/remove_exec_bit.sh
index 25466d8c37..5b0035c8b8 100755
--- a/script/remove_exec_bit.sh
+++ b/script/remove_exec_bit.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
 for file in $(git diff --cached --name-only --diff-filter=ACM | grep -E '\.(cpp|hpp|txt|inc)$'); do
     if [ -x "$file" ]; then
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp b/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
index 54410acf70..b3d433c466 100644
--- a/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp
@@ -1,3 +1,6 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "test_gemm_pipeline_kernel_types.hpp"
 #include "test_gemm_pipeline_util.hpp"
 #include "gtest/gtest.h"
diff --git a/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc b/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc
index 8d21c65692..798bbb1116 100644
--- a/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc
+++ b/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc
@@ -1,3 +1,6 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #pragma once
 
 TYPED_TEST(TestCkTileGemmMultiD, TestCkTileGemmMultiDKBatch1CShuffle_256x512x256)
diff --git a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp
index de71c4682d..d836c501ae 100644
--- a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp
+++ b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp
@@ -1,3 +1,6 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "test_gemm_pipeline_kernel_types.hpp"
 #include "test_gemm_pipeline_util.hpp"
 #include "gtest/gtest.h"
diff --git a/test_data/generate_model_configs.py b/test_data/generate_model_configs.py
index 125655cef4..f852d781d6 100644
--- a/test_data/generate_model_configs.py
+++ b/test_data/generate_model_configs.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 Generate Model Configuration Combinations for MIOpen Testing
 
diff --git a/test_data/generate_test_dataset.sh b/test_data/generate_test_dataset.sh
index 3fb8fa027b..1124311feb 100755
--- a/test_data/generate_test_dataset.sh
+++ b/test_data/generate_test_dataset.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 # Generate Comprehensive Convolution Test Dataset for CK
 # This script captures MIOpen commands from PyTorch models and generates test cases
 
diff --git a/test_data/miopen_to_csv.py b/test_data/miopen_to_csv.py
index ae8c187b43..3292584548 100644
--- a/test_data/miopen_to_csv.py
+++ b/test_data/miopen_to_csv.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 Convert MIOpen Driver Commands to CSV Test Cases
 
diff --git a/test_data/run_model_with_miopen.py b/test_data/run_model_with_miopen.py
index 3d96e19f2f..596f6a4a37 100644
--- a/test_data/run_model_with_miopen.py
+++ b/test_data/run_model_with_miopen.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 PyTorch Model Runner with MIOpen Command Logging using torchvision models