From 2273f06ad658bef026f5330d7d62e7e13083d4a1 Mon Sep 17 00:00:00 2001 From: Johannes Graner Date: Wed, 7 Jan 2026 16:30:57 +0100 Subject: [PATCH] [CI, CK examples] Disable time_kernel for CI tests and examples (#3464) * Disable kernel timing in tests * default time_kernel = false in old CK examples [ROCm/composable_kernel commit: 0a474aa62f6dd3f4b95bb405f0a8f1d457a4c0eb] --- example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp | 2 +- example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp | 2 +- example/12_reduce/reduce_blockwise.cpp | 2 +- example/12_reduce/reduce_multiblock_atomic_add.cpp | 2 +- example/12_reduce/reduce_threadwise_multi_d.cpp | 2 +- example/13_pool2d_fwd/pool2d_fwd_fp16.cpp | 2 +- .../15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp | 2 +- example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp | 2 +- .../gemm_add_add_mean_meansquare_xdl_fp16.cpp | 2 +- .../gemm_add_addsquare_xdl_int8.cpp | 2 +- example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp | 2 +- example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp | 2 +- example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp | 2 +- example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp | 2 +- example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp | 2 +- .../gemm_mean_meansquare_xdl_bf16.cpp | 2 +- .../gemm_mean_meansquare_xdl_fp16.cpp | 2 +- .../gemm_mean_meansquare_xdl_fp32.cpp | 2 +- example/22_cgemm/cgemm_xdl_int4.cpp | 2 +- example/23_softmax/softmax_blockwise.cpp | 2 +- .../run_batched_gemm_example_fp16int4_b_scale.inc | 2 +- .../batched_gemm_bias_e_permute_wmma_fp16.cpp | 2 +- example/30_grouped_conv_fwd_multiple_d/common.hpp | 2 +- example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp | 2 +- example/33_multiple_reduce/dual_reduce_common.hpp | 2 +- example/35_splitK_gemm/common.hpp | 2 +- .../36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp | 2 +- .../batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp | 2 +- .../conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp | 2 +- .../conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp | 2 +- .../conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp | 2 +- .../conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp | 2 +- .../conv2d_fwd_dl_perchannel_quantization_int8.cpp | 2 +- .../conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp | 2 +- .../conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp | 2 +- .../conv2d_fwd_xdl_perchannel_quantization_int8.cpp | 2 +- example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc | 2 +- example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp | 2 +- example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp | 2 +- .../44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp | 2 +- .../44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp | 2 +- .../44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp | 2 +- .../44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp | 2 +- .../elementwise_scale_permute_amax_2D_fp16_fp8.cpp | 2 +- example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp | 2 +- .../elementwise_layernorm_blockwise.cpp | 2 +- example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp | 2 +- .../65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp | 2 +- example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp | 2 +- example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp | 2 +- .../65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp | 2 +- example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp | 2 +- example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp | 2 +- example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp | 2 +- .../67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp | 2 +- example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp | 2 +- example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp | 2 +- .../67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp | 2 +- test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp | 2 +- test/gemm/gemm_standalone_xdl_fp16.cpp | 2 +- test/wrapper/test_wrapper_gemm_xdl.cpp | 2 +- 61 files changed, 61 insertions(+), 61 deletions(-) diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp index 0bded7d2ac..9b48d5765d 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp @@ -119,7 +119,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp index 4acf4fe9ff..a770bf5c77 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp @@ -119,7 +119,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index 55f3d99823..f8299028da 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -31,7 +31,7 @@ class SimpleAppArgs bool do_verification = true; int data_type = 1; int init_method = 2; - bool time_kernel = true; + bool time_kernel = false; public: void show_usage(const char* cmd) diff --git a/example/12_reduce/reduce_multiblock_atomic_add.cpp b/example/12_reduce/reduce_multiblock_atomic_add.cpp index af5903f83c..66fc2bb582 100644 --- a/example/12_reduce/reduce_multiblock_atomic_add.cpp +++ b/example/12_reduce/reduce_multiblock_atomic_add.cpp @@ -31,7 +31,7 @@ class SimpleAppArgs bool do_verification = true; int data_type = 1; int init_method = 2; - bool time_kernel = true; + bool time_kernel = false; public: void show_usage(const char* cmd) diff --git a/example/12_reduce/reduce_threadwise_multi_d.cpp b/example/12_reduce/reduce_threadwise_multi_d.cpp index e77daea212..ee06395771 100644 --- a/example/12_reduce/reduce_threadwise_multi_d.cpp +++ b/example/12_reduce/reduce_threadwise_multi_d.cpp @@ -31,7 +31,7 @@ class SimpleAppArgs bool do_verification = true; int data_type = 1; int init_method = 2; - bool time_kernel = true; + bool time_kernel = false; public: void show_usage(const char* cmd) diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp index f0a9ce9270..fc083ba3e2 100644 --- a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp @@ -53,7 +53,7 @@ int main(int argc, char* argv[]) { do_verification = true; init_method = 1; - time_kernel = true; + time_kernel = false; } else if(argc == 4) { diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp index 62d2022084..6fe285f165 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp @@ -90,7 +90,7 @@ struct ExecutionConfig final bool do_verification = true; int init_method = 1; int k_batch = 128; - bool time_kernel = true; + bool time_kernel = false; }; bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp index 1db8a9defb..0e1a38b19a 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp @@ -89,7 +89,7 @@ struct ExecutionConfig final { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; }; bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp index 08915fdd26..a30bedf282 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp @@ -268,7 +268,7 @@ int main() pass &= ck::utils::check_err(r1_m, r1_m_host, "Error: Incorrect results d1", 1e-2, 1e-2); } - bool time_kernel = true; + bool time_kernel = false; if(time_kernel) { float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp index 7a81d82c25..3401494625 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp @@ -302,7 +302,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 1024; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp index 5a127d1cd4..e4960668eb 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp @@ -106,7 +106,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 1024; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp index 29be3dde0a..c97fa7ebc5 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp @@ -106,7 +106,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 1024; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp index 0574488e04..f32d5e9f6d 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp @@ -106,7 +106,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 1024; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp index 7da40adc90..6c9fb8da75 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp @@ -108,7 +108,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 1024; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp index 47f1d50ef5..4a63bee894 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp @@ -105,7 +105,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 1024; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp index cac3db3078..ebd71f1799 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp @@ -112,7 +112,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 1024; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp index 5ea09cfab2..1153a66615 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp @@ -112,7 +112,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 1024; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp index 8e120851ec..6b5dde3cc7 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp @@ -112,7 +112,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 1024; diff --git a/example/22_cgemm/cgemm_xdl_int4.cpp b/example/22_cgemm/cgemm_xdl_int4.cpp index 47b0e1d5a5..4f21c70562 100644 --- a/example/22_cgemm/cgemm_xdl_int4.cpp +++ b/example/22_cgemm/cgemm_xdl_int4.cpp @@ -81,7 +81,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // CGEMM shape ck::index_t M = 1024; diff --git a/example/23_softmax/softmax_blockwise.cpp b/example/23_softmax/softmax_blockwise.cpp index a741cb8133..0455819cdc 100644 --- a/example/23_softmax/softmax_blockwise.cpp +++ b/example/23_softmax/softmax_blockwise.cpp @@ -65,7 +65,7 @@ class SimpleAppArgs bool do_verification = true; int init_method = 2; - bool time_kernel = true; + bool time_kernel = false; public: void show_usage(const char* cmd) diff --git a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc index 12d7cf0aa6..86a36d53e2 100644 --- a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc +++ b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc @@ -27,7 +27,7 @@ struct ExecutionConfig final { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; }; template diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp index 6efed7eb29..06bf971ac4 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp @@ -248,7 +248,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; ck::index_t G0 = 1; ck::index_t G1 = 2; diff --git a/example/30_grouped_conv_fwd_multiple_d/common.hpp b/example/30_grouped_conv_fwd_multiple_d/common.hpp index e1939d4300..dce9f62293 100644 --- a/example/30_grouped_conv_fwd_multiple_d/common.hpp +++ b/example/30_grouped_conv_fwd_multiple_d/common.hpp @@ -92,7 +92,7 @@ struct ExecutionConfig final { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; }; #define DefaultConvParam \ diff --git a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp index ca8cba039f..2b27405ecd 100644 --- a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp +++ b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp @@ -92,7 +92,7 @@ struct ExecutionConfig final { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; }; #define DefaultConvParam \ diff --git a/example/33_multiple_reduce/dual_reduce_common.hpp b/example/33_multiple_reduce/dual_reduce_common.hpp index 3f04af5e89..923b5b6f15 100644 --- a/example/33_multiple_reduce/dual_reduce_common.hpp +++ b/example/33_multiple_reduce/dual_reduce_common.hpp @@ -40,7 +40,7 @@ class SimpleAppArgs bool do_verification = true; int init_method = 2; - bool time_kernel = true; + bool time_kernel = false; public: SimpleAppArgs() diff --git a/example/35_splitK_gemm/common.hpp b/example/35_splitK_gemm/common.hpp index d0f03f3611..8bf09ee786 100644 --- a/example/35_splitK_gemm/common.hpp +++ b/example/35_splitK_gemm/common.hpp @@ -44,7 +44,7 @@ struct ExecutionConfig final { bool do_verification = true; int init_method = 2; - bool time_kernel = true; + bool time_kernel = false; }; template diff --git a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp index 2f290497c9..ea8858b958 100644 --- a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp +++ b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp @@ -56,7 +56,7 @@ template<> struct emb_kernel { using kernel_type = DeviceInsta int main(int argc, char* argv[]) { - bool time_kernel = true; + bool time_kernel = false; ck::index_t num_rows = 65536; constexpr auto dims = ck::Sequence<256, 512, 768, 1024, 1536, 2048, 4096, 8192>{}; diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp index dc0b95863e..ab87124c6b 100644 --- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp +++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp @@ -195,7 +195,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t M = 1024; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp index c6cc9c6a15..9e7039461c 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp @@ -86,7 +86,7 @@ using DeviceGroupedConvNDFwdInstance = int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp index 0f49cb5a38..fa6a36c212 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp @@ -84,7 +84,7 @@ using DeviceGroupedConvNDFwdInstance = int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp index 5652cc38ab..45651da757 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp @@ -87,7 +87,7 @@ using DeviceGroupedConvNDFwdInstance = int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp index 138a214127..cda4c1419c 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp @@ -84,7 +84,7 @@ using DeviceGroupedConvNDFwdInstance = int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp index 1652cea214..0e52ac280a 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp @@ -84,7 +84,7 @@ using DeviceGroupedConvNDFwdInstance = int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp index f127940377..9bff452a67 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp @@ -90,7 +90,7 @@ using DeviceGroupedConvNDFwdInstance = int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp index 7a03a3efe0..17a7b632af 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp @@ -88,7 +88,7 @@ using DeviceGroupedConvNDFwdInstance = int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp index 155024dc62..345277e092 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp @@ -88,7 +88,7 @@ using DeviceGroupedConvNDFwdInstance = int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc index b1596b5a53..d5f9b831f0 100644 --- a/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc +++ b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc @@ -12,7 +12,7 @@ int run_groupnorm_fwd_example(int argc, char* argv[]) ck::index_t C = 128; bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; bool log_kernel = true; if(argc == 1) diff --git a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp index 14b338c9c5..e90880dabd 100644 --- a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp +++ b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp @@ -53,7 +53,7 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; std::vector nchw = {16, 128, 32, 64}; diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp index a7d139fc95..2b99d9261f 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp @@ -46,7 +46,7 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp index cd1db4cdaf..276aa7f3c7 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp @@ -50,7 +50,7 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp index 683c5cb072..0842325bad 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp @@ -50,7 +50,7 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp index abfd3ccf7c..a48f2349c9 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp @@ -49,7 +49,7 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp index ff4e8f3a3d..39d88c47a1 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp @@ -50,7 +50,7 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp index 939860bf69..3aef0fdaac 100644 --- a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp +++ b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp @@ -121,7 +121,7 @@ void reference_scale_permute_amax(Tensor& input, int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; const float scale = 2.f; diff --git a/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp index 497f1c67c8..86af00e4fb 100644 --- a/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp +++ b/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp @@ -58,7 +58,7 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; if(argc == 1) { diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp index eb95128f38..71cee9c420 100644 --- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp +++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp @@ -84,7 +84,7 @@ void host_elementwise2D(HostTensorC& C, int main(int argc, char* argv[]) { bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; ck::index_t M = 48 * 256; ck::index_t N = 1024; diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp index c0452b6067..10f7a38863 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp @@ -205,7 +205,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // GEMM shape ck::index_t N = 4096; diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp index ecc3034bba..d6082e5882 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp @@ -193,7 +193,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; #if 1 // GEMM shape ck::index_t N = 4096; diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp index 0067c1d1fb..a2002270dc 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp @@ -194,7 +194,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // per expert: // GEMM shape diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp index a602838c30..9f4cd13573 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp @@ -185,7 +185,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // per expert: // GEMM shape diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp index fb5e3b6456..552d3cd7b5 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp @@ -188,7 +188,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // tokens = 1 // topk = 1 diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp index f56410d37a..377b53b519 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp @@ -164,7 +164,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // per expert: // GEMM shape diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp index 3ce059ba20..586ecd81bf 100644 --- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp +++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp @@ -178,7 +178,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // per expert: // GEMM shape diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp index d1d601977d..b3b2ebcbc0 100644 --- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp +++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp @@ -178,7 +178,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // per expert: // GEMM shape diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp index 0078cc5625..5c7668ab73 100644 --- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp +++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp @@ -208,7 +208,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // per expert: // GEMM shape diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp index 202241d14f..04c3afc62b 100644 --- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp +++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp @@ -171,7 +171,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // per expert: // GEMM shape diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp index 660ccabc94..12bb76eccd 100644 --- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp +++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp @@ -171,7 +171,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // per expert: // GEMM shape diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp index f398959114..6a5f5a6b9f 100644 --- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp +++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp @@ -204,7 +204,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; // per expert: // GEMM shape diff --git a/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp b/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp index e26ac53abe..2403c564b7 100644 --- a/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp +++ b/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp @@ -61,7 +61,7 @@ class TestBatchedGemmMultiD : public ::testing::Test true, // do_verification 1, // init_method false, // do_log - 1, // time_kernel, + false, // time_kernel, M, N, K, diff --git a/test/gemm/gemm_standalone_xdl_fp16.cpp b/test/gemm/gemm_standalone_xdl_fp16.cpp index 90a5a325b8..2df67a083a 100644 --- a/test/gemm/gemm_standalone_xdl_fp16.cpp +++ b/test/gemm/gemm_standalone_xdl_fp16.cpp @@ -104,7 +104,7 @@ int main(int argc, char* argv[]) }; bool do_verification = true; - bool time_kernel = true; + bool time_kernel = false; int problem_index = -1; if(argc == 1) diff --git a/test/wrapper/test_wrapper_gemm_xdl.cpp b/test/wrapper/test_wrapper_gemm_xdl.cpp index b9d4bc3e57..b8965a217b 100644 --- a/test/wrapper/test_wrapper_gemm_xdl.cpp +++ b/test/wrapper/test_wrapper_gemm_xdl.cpp @@ -306,7 +306,7 @@ void PerformGemm(const ck::index_t M, const auto kernel = DeviceGemm; - const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true}, + const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, false}, kernel, dim3(grid_size_x, grid_size_y, 1), dim3(ck::wrapper::size(thread_layout)),