From ed41349e19d7e51dde76d8289daeb1d66dbb7cb7 Mon Sep 17 00:00:00 2001 From: "assistant-librarian[bot]" Date: Thu, 11 Sep 2025 20:11:33 +0000 Subject: [PATCH] Merge commit 'ffe9775e7071d524caa70cf13017ebe80fe6272b' into develop --- .pre-commit-config.yaml | 12 ++--- .../run_grouped_gemm_example.inc | 45 ++++++++-------- ...bias_relu_add_layernorm_xdl_naive_fp16.cpp | 52 +++++++++++++------ .../gemm_layernorm_xdl_naive_fp16.cpp | 48 +++++++++++------ .../grouped_gemm_bias_e_permute_xdl_fp16.cpp | 12 +++-- example/39_permute/permute_1xHxW_fp16.cpp | 21 +++++++- example/39_permute/permute_HxWx4_fp16.cpp | 21 +++++++- example/39_permute/permute_NxHxW_fp16.cpp | 21 +++++++- .../39_permute/run_permute_bundle_example.inc | 10 ++-- .../run_permute_element_example.inc | 10 ++-- ...bias_relu_perchannel_quantization_int8.cpp | 24 ++++++++- ...l_bias_relu_perlayer_quantization_int8.cpp | 23 +++++++- ...bias_tanh_perchannel_quantization_int8.cpp | 24 ++++++++- ...l_bias_tanh_perlayer_quantization_int8.cpp | 23 +++++++- ...2d_fwd_dl_perchannel_quantization_int8.cpp | 23 +++++++- ...nv2d_fwd_dl_perlayer_quantization_int8.cpp | 23 +++++++- ...bias_relu_perchannel_quantization_int8.cpp | 24 ++++++++- ...l_bias_relu_perlayer_quantization_int8.cpp | 23 +++++++- ...d_fwd_xdl_perchannel_quantization_int8.cpp | 23 +++++++- ...v2d_fwd_xdl_perlayer_quantization_int8.cpp | 23 +++++++- ...d_bias_perchannel_quantization_example.inc | 6 +-- ...fwd_bias_perlayer_quantization_example.inc | 6 +-- ...2d_fwd_perchannel_quantization_example.inc | 6 +-- ...nv2d_fwd_perlayer_quantization_example.inc | 6 +-- .../run_groupnorm_fwd_example.inc | 33 +++++++++--- .../elementwise_binary_4D_fp16.cpp | 18 ++++++- .../elementwise_permute_4D_fp16.cpp | 18 ++++++- .../elementwise_permute_4D_fp16_col.cpp | 18 ++++++- .../elementwise_permute_4D_fp16_row.cpp | 18 ++++++- .../elementwise_permute_4D_fp32_col.cpp | 18 ++++++- .../elementwise_permute_4D_fp32_row.cpp | 18 ++++++- ...entwise_scale_permute_amax_2D_fp16_fp8.cpp | 16 ++++++ .../elementwise_trinary_4D_fp16.cpp | 18 ++++++- .../elementwise_layernorm_blockwise.cpp | 22 +++++++- example/ck_tile/15_fused_moe/main.cpp | 3 ++ example/ck_tile/39_copy/test_tile_example.sh | 3 ++ .../gemm_pipeline_ag_bg_cr_comp_v5.hpp | 3 ++ script/dependency-parser/main.py | 3 ++ .../src/enhanced_ninja_parser.py | 3 ++ .../src/selective_test_filter.py | 3 ++ script/gemm_profile.sh | 2 + script/launch_tests.sh | 2 + script/ninja_json_converter.py | 2 + script/remod_for_ck_tile.sh | 2 + script/remove_exec_bit.sh | 2 + .../gemm/test_gemm_pipeline_persistent.cpp | 3 ++ .../test_gemm_multi_d_ut_cases_cshuffle.inc | 3 ++ .../test_gemm_pipeline_wp.cpp | 3 ++ test_data/generate_model_configs.py | 3 ++ test_data/generate_test_dataset.sh | 3 ++ test_data/miopen_to_csv.py | 3 ++ test_data/run_model_with_miopen.py | 3 ++ 52 files changed, 633 insertions(+), 122 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 664c5219e2..2d936d3a48 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,12 +6,12 @@ repos: entry: clang-format-18 -i --style=file language: system types_or: [c++, inc] - - id: copyright-year-checker - name: copyright-year-checker - entry: script/check_copyright_year.sh - verbose: false - language: script - types: [c++] + # - id: copyright-year-checker + # name: copyright-year-checker + # entry: script/check_copyright_year.sh + # verbose: false + # language: script + # types: [c++] - id: remove-exec-bit name: Remove executable bit from non-executable files entry: script/remove_exec_bit.sh diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc index 7186c22233..4ef6074f4a 100644 --- a/example/15_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc @@ -278,6 +278,30 @@ bool run_grouped_gemm_example(int argc, char* argv[]) problem_size.group_count = 16; + if(argc == 4) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + else if(argc == 6) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + config.async_hargs = std::stoi(argv[4]); + problem_size.group_count = std::stoi(argv[5]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); + printf("arg4: async hargs (0=n0, 1=yes)\n"); + printf("arg5: group count (default=16)"); + exit(0); + } + for(int i = 0; i < problem_size.group_count; i++) { problem_size.Ms.push_back(256 + 256 * i); @@ -288,27 +312,6 @@ bool run_grouped_gemm_example(int argc, char* argv[]) problem_size.stride_Bs.push_back(problem_size.Ks[i]); problem_size.stride_Cs.push_back(problem_size.Ns[i]); } - if(argc == 4) - { - config.do_verification = std::stoi(argv[1]); - config.init_method = std::stoi(argv[2]); - config.time_kernel = std::stoi(argv[3]); - } - else if(argc == 5) - { - config.do_verification = std::stoi(argv[1]); - config.init_method = std::stoi(argv[2]); - config.time_kernel = std::stoi(argv[3]); - config.async_hargs = std::stoi(argv[4]); - } - else - { - printf("arg1: verification (0=no, 1=yes)\n"); - printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: time kernel (0=n0, 1=yes)\n"); - printf("arg4: async hargs (0=n0, 1=yes)\n"); - exit(0); - } return run_grouped_gemm(problem_size, config); } diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp index 5dccb11bba..71a6f7f3d1 100644 --- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp @@ -236,7 +236,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M, << " GB/s, " << std::endl; } -int main() +int main(int argc, char* argv[]) { // GEMM shape ck::index_t M = 1024; @@ -249,6 +249,25 @@ int main() ck::index_t StrideD1 = 1024; ck::index_t StrideE = 1024; + bool do_verification = true; + bool time_kernel = false; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = static_cast(std::stoi(argv[2])); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + Tensor a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{})); Tensor b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{})); Tensor bias_n(f_host_tensor_descriptor1d(N, 1)); @@ -357,6 +376,7 @@ int main() normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false}); bool pass = true; + if(do_verification) { // verification Tensor host_layerNorm_m_n( @@ -383,27 +403,25 @@ int main() 1e-2); } + if(time_kernel) { // evaluate kernel perf - bool time_kernel = true; - float gemm_reduce_mean_reduce_square_mean_ave_time = - gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel}); + gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true}); float normalize_ave_time = - normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true}); - if(time_kernel) - DumpGemmLayerNormPerf( - gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K); + DumpGemmLayerNormPerf( + gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K); } return pass ? 0 : 1; diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp index 168193ad5b..833cb48358 100644 --- a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp @@ -221,7 +221,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M, << " GB/s, " << std::endl; } -int main() +int main(int argc, char* argv[]) { // GEMM shape ck::index_t M = 1024; @@ -232,6 +232,25 @@ int main() ck::index_t StrideB = 1024; ck::index_t StrideE = 1024; + bool do_verification = true; + bool time_kernel = false; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = static_cast(std::stoi(argv[2])); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + Tensor a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{})); Tensor b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{})); Tensor e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); @@ -333,6 +352,7 @@ int main() normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false}); bool pass = true; + if(do_verification) { // verification Tensor host_layerNorm_m_n( @@ -354,25 +374,23 @@ int main() layerNorm_m_n, host_layerNorm_m_n, "Error: Incorrect results d1", 1e-3, 1e-3); } + if(time_kernel) { // evaluate kernel perf - bool time_kernel = true; - float gemm_reduce_mean_reduce_square_mean_ave_time = - gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel}); + gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true}); float normalize_ave_time = - normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true}); - if(time_kernel) - DumpGemmLayerNormPerf( - gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K); + DumpGemmLayerNormPerf( + gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K); } return pass ? 0 : 1; diff --git a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp index 24e9b1d9b7..37ae349f44 100644 --- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp +++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp @@ -194,22 +194,28 @@ int main(int argc, char* argv[]) int init_method = 1; bool time_kernel = false; - if(argc == 4) + std::size_t group_count = rand() % 16 + 1; + + if(argc == 1) + { + // use default + } + else if(argc == 5) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); time_kernel = std::stoi(argv[3]); + group_count = std::stoi(argv[4]); } else { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg3: time kernel (0=n0, 1=yes)\n"); + printf("arg4: group count (default = random from 1..16)"); exit(0); } - std::size_t group_count = rand() % 16 + 1; - // GEMM shape std::vector> contraction_descs; std::vector p_a, p_b; diff --git a/example/39_permute/permute_1xHxW_fp16.cpp b/example/39_permute/permute_1xHxW_fp16.cpp index 7336c3b631..30cf4ef083 100644 --- a/example/39_permute/permute_1xHxW_fp16.cpp +++ b/example/39_permute/permute_1xHxW_fp16.cpp @@ -17,4 +17,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl #include "run_permute_element_example.inc" -int main() { return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}); } +int main(int argc, char* argv[]) +{ + bool time_kernel = false; + + if(argc == 1) + { + // use default + } + else if(argc == 2) + { + time_kernel = std::stoi(argv[1]); + } + else + { + printf("arg1: time kernel (0=no, 1=yes, default=0)\n"); + exit(0); + } + + return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}, time_kernel); +} diff --git a/example/39_permute/permute_HxWx4_fp16.cpp b/example/39_permute/permute_HxWx4_fp16.cpp index 6c24919ded..c655384301 100644 --- a/example/39_permute/permute_HxWx4_fp16.cpp +++ b/example/39_permute/permute_HxWx4_fp16.cpp @@ -19,4 +19,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl #include "run_permute_bundle_example.inc" -int main() { return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}); } +int main(int argc, char* argv[]) +{ + bool time_kernel = false; + + if(argc == 1) + { + // use default + } + else if(argc == 2) + { + time_kernel = std::stoi(argv[1]); + } + else + { + printf("arg1: time kernel (0=no, 1=yes, default=0)\n"); + exit(0); + } + + return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}, time_kernel); +} diff --git a/example/39_permute/permute_NxHxW_fp16.cpp b/example/39_permute/permute_NxHxW_fp16.cpp index 3551d2a7c8..d3d7f47ced 100644 --- a/example/39_permute/permute_NxHxW_fp16.cpp +++ b/example/39_permute/permute_NxHxW_fp16.cpp @@ -17,4 +17,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl #include "run_permute_element_example.inc" -int main() { return !run_permute_element_example({121, 768, 80}, {0, 2, 1}); } +int main(int argc, char* argv[]) +{ + bool time_kernel = false; + + if(argc == 1) + { + // use default + } + else if(argc == 2) + { + time_kernel = std::stoi(argv[1]); + } + else + { + printf("arg1: time kernel (0=no, 1=yes, default=0)\n"); + exit(0); + } + + return !run_permute_element_example({121, 768, 80}, {0, 2, 1}, time_kernel); +} diff --git a/example/39_permute/run_permute_bundle_example.inc b/example/39_permute/run_permute_bundle_example.inc index 2c19872922..fab02f8cf3 100644 --- a/example/39_permute/run_permute_bundle_example.inc +++ b/example/39_permute/run_permute_bundle_example.inc @@ -3,7 +3,7 @@ #pragma once -bool run_permute_bundle(const Problem& problem) +bool run_permute_bundle(const Problem& problem, bool time_kernel) { const auto& input_bundle_shape = problem.shape; const auto& input_bundle_axes = problem.axes; @@ -41,7 +41,7 @@ bool run_permute_bundle(const Problem& problem) }; auto invoker = permute.MakeInvoker(); - float ave_time = invoker.Run(argument, StreamConfig{nullptr, true}); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::cout << "Perf: " << ave_time << " ms" << std::endl; @@ -72,7 +72,9 @@ bool run_permute_bundle(const Problem& problem) 1e-6); } -bool run_permute_bundle_example(const Problem::Shape& shape, const Problem::Axes& axes) +bool run_permute_bundle_example(const Problem::Shape& shape, + const Problem::Axes& axes, + bool time_kernel) { - return run_permute_bundle(Problem{shape, axes}); + return run_permute_bundle(Problem{shape, axes}, time_kernel); } diff --git a/example/39_permute/run_permute_element_example.inc b/example/39_permute/run_permute_element_example.inc index 3587134456..c3f3b972e9 100644 --- a/example/39_permute/run_permute_element_example.inc +++ b/example/39_permute/run_permute_element_example.inc @@ -3,7 +3,7 @@ #pragma once -bool run_permute_element(const Problem& problem) +bool run_permute_element(const Problem& problem, bool time_kernel) { const auto& input_shape = problem.shape; const auto& input_axes = problem.axes; @@ -40,7 +40,7 @@ bool run_permute_element(const Problem& problem) }; auto invoker = permute.MakeInvoker(); - float ave_time = invoker.Run(argument, StreamConfig{nullptr, true}); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::cout << "Perf: " << ave_time << " ms" << std::endl; @@ -59,7 +59,9 @@ bool run_permute_element(const Problem& problem) 1e-6); } -bool run_permute_element_example(const Problem::Shape& shape, const Problem::Axes& axes) +bool run_permute_element_example(const Problem::Shape& shape, + const Problem::Axes& axes, + bool time_kernel) { - return run_permute_element(Problem{shape, axes}); + return run_permute_element(Problem{shape, axes}, time_kernel); } diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp index 4573c68658..f9a7d9f638 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp @@ -78,8 +78,28 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_bias_perchannel_quantization_example.inc" -int main() +int main(int argc, char* argv[]) { + bool do_verification = true; + bool time_kernel = true; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + const auto out_element_op = OutElementOp{ActivationOp{}}; - run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op); + run_conv2d_fwd_bias_perchannel_quantization_example( + out_element_op, do_verification, time_kernel); }; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp index 005f6263fd..333987edd6 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp @@ -76,9 +76,28 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_bias_perlayer_quantization_example.inc" -int main() +int main(int argc, char* argv[]) { + bool do_verification = true; + bool time_kernel = true; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + float requant_scale = 0.5f; const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}}; - run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op); + run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel); } diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp index 62e5e583de..4b94045421 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp @@ -79,9 +79,29 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_bias_perchannel_quantization_example.inc" -int main() +int main(int argc, char* argv[]) { + bool do_verification = true; + bool time_kernel = true; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + float scale_z_inv = 0.5f; const auto out_element_op = OutElementOp{scale_z_inv, ActivationOp{}}; - run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op); + run_conv2d_fwd_bias_perchannel_quantization_example( + out_element_op, do_verification, time_kernel); }; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp index ef98fe7e4f..b74e06b10a 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp @@ -76,10 +76,29 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_bias_perlayer_quantization_example.inc" -int main() +int main(int argc, char* argv[]) { + bool do_verification = true; + bool time_kernel = true; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + float scale_acc = 0.5f; float scale_z_inv = 0.5f; const auto out_element_op = OutElementOp{scale_z_inv, scale_acc, ActivationOp{}}; - run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op); + run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel); } diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp index e524ddb2b2..c3ac40a1bc 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp @@ -76,8 +76,27 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_perchannel_quantization_example.inc" -int main() +int main(int argc, char* argv[]) { + bool do_verification = true; + bool time_kernel = true; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + const auto out_element_op = OutElementOp{ActivationOp{}}; - run_conv2d_fwd_perchannel_quantization_example(out_element_op); + run_conv2d_fwd_perchannel_quantization_example(out_element_op, do_verification, time_kernel); } diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp index d29a3143c0..437fd6f4c2 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp @@ -71,9 +71,28 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_perlayer_quantization_example.inc" -int main() +int main(int argc, char* argv[]) { + bool do_verification = true; + bool time_kernel = false; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + float requant_scale = 0.5f; const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}}; - run_conv2d_fwd_perlayer_quantization_example(out_element_op); + run_conv2d_fwd_perlayer_quantization_example(out_element_op, do_verification, time_kernel); } diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp index 8c0049b0fa..d2af8926a5 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp @@ -82,8 +82,28 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_bias_perchannel_quantization_example.inc" -int main() +int main(int argc, char* argv[]) { + bool do_verification = true; + bool time_kernel = true; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + const auto out_element_op = OutElementOp{ActivationOp{}}; - run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op); + run_conv2d_fwd_bias_perchannel_quantization_example( + out_element_op, do_verification, time_kernel); }; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp index e18c123f7c..81ce2d57e8 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp @@ -80,9 +80,28 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_bias_perlayer_quantization_example.inc" -int main() +int main(int argc, char* argv[]) { + bool do_verification = true; + bool time_kernel = true; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + float requant_scale = 0.5f; const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}}; - run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op); + run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel); } diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp index 53f810cc9e..85e3dd9f15 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp @@ -80,8 +80,27 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_perchannel_quantization_example.inc" -int main() +int main(int argc, char* argv[]) { + bool do_verification = true; + bool time_kernel = true; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + const auto out_element_op = OutElementOp{ActivationOp{}}; - run_conv2d_fwd_perchannel_quantization_example(out_element_op); + run_conv2d_fwd_perchannel_quantization_example(out_element_op, do_verification, time_kernel); } diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp index 9db6e201dd..2f333b1198 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp @@ -75,9 +75,28 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_perlayer_quantization_example.inc" -int main() +int main(int argc, char* argv[]) { + bool do_verification = true; + bool time_kernel = false; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + float requant_scale = 0.5f; const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}}; - run_conv2d_fwd_perlayer_quantization_example(out_element_op); + run_conv2d_fwd_perlayer_quantization_example(out_element_op, do_verification, time_kernel); } diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc index e5b924ad51..30e0791ebf 100644 --- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc +++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc @@ -167,10 +167,10 @@ bool run_grouped_conv_fwd(bool do_verification, return (pass ? 0 : 1); } -int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op) +int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op, + bool do_verification, + bool time_kernel) { - bool do_verification = true; - bool time_kernel = true; const ck::index_t ndim_spatial = 2; ck::utils::conv::ConvParam conv_param{ diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc index 9f3a769dcf..32fd435e00 100644 --- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc +++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc @@ -155,10 +155,10 @@ bool run_grouped_conv_fwd(bool do_verification, return (pass ? 0 : 1); } -int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op) +int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op, + bool do_verification, + bool time_kernel) { - bool do_verification = true; - bool time_kernel = true; const ck::index_t ndim_spatial = 2; ck::utils::conv::ConvParam conv_param{ diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc index 9b08fc690d..362d90b4c1 100644 --- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc +++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc @@ -157,10 +157,10 @@ bool run_grouped_conv_fwd(bool do_verification, return (pass ? 0 : 1); } -int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op) +int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op, + bool do_verification, + bool time_kernel) { - bool do_verification = true; - bool time_kernel = true; const ck::index_t ndim_spatial = 2; ck::utils::conv::ConvParam conv_param{ diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc index 267c737e00..eae6e996cc 100644 --- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc +++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc @@ -139,10 +139,10 @@ bool run_grouped_conv_fwd(bool do_verification, return (pass ? 0 : 1); } -int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op) +int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op, + bool do_verification, + bool time_kernel) { - bool do_verification = true; - bool time_kernel = false; const ck::index_t ndim_spatial = 2; ck::utils::conv::ConvParam conv_param{ diff --git a/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc index ab6f317bc6..86e1c8ccc8 100644 --- a/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc +++ b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc @@ -11,21 +11,36 @@ int run_groupnorm_fwd_example(int argc, char* argv[]) ck::index_t G = 64; ck::index_t C = 128; + bool do_verification = true; + bool time_kernel = true; + bool log_kernel = true; + if(argc == 1) { // use default case } - else if(argc == 6) + else if(argc == 4) { - N = std::stoi(argv[1]); - H = std::stoi(argv[2]); - W = std::stoi(argv[3]); - G = std::stoi(argv[4]); - C = std::stoi(argv[5]); + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + log_kernel = std::stoi(argv[3]); + } + else if(argc == 9) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + log_kernel = std::stoi(argv[3]); + N = std::stoi(argv[4]); + H = std::stoi(argv[5]); + W = std::stoi(argv[6]); + G = std::stoi(argv[7]); + C = std::stoi(argv[8]); } else { - std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl; + std::cerr << "arg1 = verify(0=no, 1=yes), arg2 = time kernels(0=no, 1=yes), arg3 = log " + "kernels(0=no, 1=yes), arg4 to 8: N, H, W, G, C" + << std::endl; return 1; } @@ -94,7 +109,8 @@ int run_groupnorm_fwd_example(int argc, char* argv[]) device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); auto invoker_ptr = device_instance.MakeInvokerPointer(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true}); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel, log_kernel}); std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C + sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C + @@ -106,6 +122,7 @@ int run_groupnorm_fwd_example(int argc, char* argv[]) << device_instance.GetTypeString() << std::endl; bool pass = true; + if(do_verification) { Tensor host_y({N, H, W, G, C}); Tensor host_save_mean(HostTensorDescriptor{N, G}); diff --git a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp index 8819bb65e6..1564bcb006 100644 --- a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp +++ b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp @@ -44,11 +44,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle ck::Sequence<8, 8>, // InScalarPerVectorSeq ck::Sequence<8>>; // OutScalarPerVectorSeq -int main() +int main(int argc, char* argv[]) { bool do_verification = true; bool time_kernel = true; + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + std::vector nchw = {16, 128, 32, 64}; std::array ab_lengths; std::array ab_strides = {static_cast(nchw[1] * nchw[2] * nchw[3]), diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp index 3ea1aa4bf8..9e92543252 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp @@ -37,11 +37,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle ck::Sequence<8>, // InScalarPerVectorSeq ck::Sequence<8>>; // OutScalarPerVectorSeq -int main() +int main(int argc, char* argv[]) { bool do_verification = true; bool time_kernel = true; + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + std::vector nchw = {16, 128, 32, 64}; std::vector nhwc = {16, 32, 64, 128}; diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp index 13c67fce05..88c23b5f40 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp @@ -41,11 +41,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle ck::Sequence<8>, // InScalarPerVectorSeq ck::Sequence<8>>; // OutScalarPerVectorSeq -int main() +int main(int argc, char* argv[]) { bool do_verification = true; bool time_kernel = true; + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + std::vector nchw = {16, 8, 32, 64}; std::vector nhwc = {16, 32, 64, 8}; std::array ab_lengths; diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp index 0a0f6fec10..1185b5a3ca 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp @@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle ck::Sequence<8>, // InScalarPerVectorSeq ck::Sequence<8>>; // OutScalarPerVectorSeq -int main() +int main(int argc, char* argv[]) { bool do_verification = true; bool time_kernel = true; + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + std::vector nchw = {16, 128, 32, 64}; std::vector nhwc = {16, 32, 64, 128}; diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp index fc664186be..28a3dbc44c 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp @@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle ck::Sequence<1>, // InScalarPerVectorSeq ck::Sequence<1>>; // OutScalarPerVectorSeq -int main() +int main(int argc, char* argv[]) { bool do_verification = true; bool time_kernel = true; + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + std::vector nchw = {16, 8, 32, 64}; std::vector nhwc = {16, 32, 64, 8}; std::array ab_lengths; diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp index a0c416318a..14d1d96165 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp @@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle ck::Sequence<8>, // InScalarPerVectorSeq ck::Sequence<8>>; // OutScalarPerVectorSeq -int main() +int main(int argc, char* argv[]) { bool do_verification = true; bool time_kernel = true; + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + std::vector nchw = {16, 128, 32, 64}; std::vector nhwc = {16, 32, 64, 128}; diff --git a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp index c40447e1f9..0619cc7139 100644 --- a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp +++ b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp @@ -119,6 +119,22 @@ int main(int argc, char* argv[]) bool do_verification = true; bool time_kernel = true; + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + const float scale = 2.f; ck::index_t M = 1024; diff --git a/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp index 050300eed2..2583f1cb5e 100644 --- a/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp +++ b/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp @@ -48,11 +48,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle ck::Sequence<8, 8, 8>, // InScalarPerVectorSeq ck::Sequence<8>>; // OutScalarPerVectorSeq -int main() +int main(int argc, char* argv[]) { bool do_verification = true; bool time_kernel = true; + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } + std::vector nchw = {16, 128, 32, 64}; std::array ab_lengths; std::array ab_strides = {static_cast(nchw[1] * nchw[2] * nchw[3]), diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp index c02d540983..51006e676b 100644 --- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp +++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp @@ -77,9 +77,26 @@ void host_elementwise2D(HostTensorC& C, } } -int main() +int main(int argc, char* argv[]) { - bool time_kernel = true; + bool do_verification = true; + bool time_kernel = true; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: time kernel (0=no, 1=yes)\n"); + exit(0); + } ck::index_t M = 48 * 256; ck::index_t N = 1024; @@ -157,6 +174,7 @@ int main() std::cout << "Time elapase is : " << ela_time << " ms . " << std::endl; bool pass = true; + if(do_verification) { std::vector mn = {static_cast(M), static_cast(N)}; diff --git a/example/ck_tile/15_fused_moe/main.cpp b/example/ck_tile/15_fused_moe/main.cpp index b6cc3b6543..5129b46231 100644 --- a/example/ck_tile/15_fused_moe/main.cpp +++ b/example/ck_tile/15_fused_moe/main.cpp @@ -1,3 +1,6 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + #include #include #include diff --git a/example/ck_tile/39_copy/test_tile_example.sh b/example/ck_tile/39_copy/test_tile_example.sh index fcd8c8e991..416338fac4 100755 --- a/example/ck_tile/39_copy/test_tile_example.sh +++ b/example/ck_tile/39_copy/test_tile_example.sh @@ -1,4 +1,7 @@ #!/usr/bin/env bash +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + set -euo pipefail BIN="${BIN:-../../../build/bin/tile_example_copy}" diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp index b05145890f..b83d37a790 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp @@ -1,3 +1,6 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + #include "ck_tile/core.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" diff --git a/script/dependency-parser/main.py b/script/dependency-parser/main.py index b8fd67ac49..5c956bca00 100644 --- a/script/dependency-parser/main.py +++ b/script/dependency-parser/main.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + """ Unified CLI for Ninja Dependency Analysis and Selective Testing diff --git a/script/dependency-parser/src/enhanced_ninja_parser.py b/script/dependency-parser/src/enhanced_ninja_parser.py index 087ab50640..725768a61f 100644 --- a/script/dependency-parser/src/enhanced_ninja_parser.py +++ b/script/dependency-parser/src/enhanced_ninja_parser.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + """ Enhanced Ninja Dependency Parser diff --git a/script/dependency-parser/src/selective_test_filter.py b/script/dependency-parser/src/selective_test_filter.py index f364d60d27..e8698d115d 100644 --- a/script/dependency-parser/src/selective_test_filter.py +++ b/script/dependency-parser/src/selective_test_filter.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + """ Selective Test Filter Tool diff --git a/script/gemm_profile.sh b/script/gemm_profile.sh index 487b90d640..89419ca711 100755 --- a/script/gemm_profile.sh +++ b/script/gemm_profile.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT BIN=./bin/tile_example_gemm_weight_preshuffle PREC=fp8 diff --git a/script/launch_tests.sh b/script/launch_tests.sh index 829ac82378..5e71e25478 100755 --- a/script/launch_tests.sh +++ b/script/launch_tests.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT # Get the directory where the script is located BUILD_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" diff --git a/script/ninja_json_converter.py b/script/ninja_json_converter.py index 92660dc7b3..7bfb2f867b 100644 --- a/script/ninja_json_converter.py +++ b/script/ninja_json_converter.py @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT """ Converts .ninja_log files into Chrome's about:tracing format. diff --git a/script/remod_for_ck_tile.sh b/script/remod_for_ck_tile.sh index 5c7a78d0cc..b017d2e1d6 100755 --- a/script/remod_for_ck_tile.sh +++ b/script/remod_for_ck_tile.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT # Get list of staged files STAGED_FILES=$(git diff --cached --name-only) diff --git a/script/remove_exec_bit.sh b/script/remove_exec_bit.sh index 25466d8c37..5b0035c8b8 100755 --- a/script/remove_exec_bit.sh +++ b/script/remove_exec_bit.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT for file in $(git diff --cached --name-only --diff-filter=ACM | grep -E '\.(cpp|hpp|txt|inc)$'); do if [ -x "$file" ]; then diff --git a/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp b/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp index 54410acf70..b3d433c466 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_persistent.cpp @@ -1,3 +1,6 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + #include "test_gemm_pipeline_kernel_types.hpp" #include "test_gemm_pipeline_util.hpp" #include "gtest/gtest.h" diff --git a/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc b/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc index 8d21c65692..798bbb1116 100644 --- a/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc +++ b/test/ck_tile/gemm_multi_d/test_gemm_multi_d_ut_cases_cshuffle.inc @@ -1,3 +1,6 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + #pragma once TYPED_TEST(TestCkTileGemmMultiD, TestCkTileGemmMultiDKBatch1CShuffle_256x512x256) diff --git a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp index de71c4682d..d836c501ae 100644 --- a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp +++ b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_wp.cpp @@ -1,3 +1,6 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + #include "test_gemm_pipeline_kernel_types.hpp" #include "test_gemm_pipeline_util.hpp" #include "gtest/gtest.h" diff --git a/test_data/generate_model_configs.py b/test_data/generate_model_configs.py index 125655cef4..f852d781d6 100644 --- a/test_data/generate_model_configs.py +++ b/test_data/generate_model_configs.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + """ Generate Model Configuration Combinations for MIOpen Testing diff --git a/test_data/generate_test_dataset.sh b/test_data/generate_test_dataset.sh index 3fb8fa027b..1124311feb 100755 --- a/test_data/generate_test_dataset.sh +++ b/test_data/generate_test_dataset.sh @@ -1,4 +1,7 @@ #!/bin/bash +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + # Generate Comprehensive Convolution Test Dataset for CK # This script captures MIOpen commands from PyTorch models and generates test cases diff --git a/test_data/miopen_to_csv.py b/test_data/miopen_to_csv.py index ae8c187b43..3292584548 100644 --- a/test_data/miopen_to_csv.py +++ b/test_data/miopen_to_csv.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + """ Convert MIOpen Driver Commands to CSV Test Cases diff --git a/test_data/run_model_with_miopen.py b/test_data/run_model_with_miopen.py index 3d96e19f2f..596f6a4a37 100644 --- a/test_data/run_model_with_miopen.py +++ b/test_data/run_model_with_miopen.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +# Copyright © Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + """ PyTorch Model Runner with MIOpen Command Logging using torchvision models