diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp
index eb8b5c76d3..9e125c4e5d 100644
--- a/example/12_reduce/reduce_blockwise_two_call.cpp
+++ b/example/12_reduce/reduce_blockwise_two_call.cpp
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
     const std::array<int, 2> reduceDims = {3, 4};
     // const std::array<int, 3> invariantDims = {0, 1, 2};
 
-    const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
+    std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
 
     // input lengths of the second reduction, which is also the output lengths of the first
     // reduction
-    const std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
+    std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
 
-    const std::vector<size_t> outLengths = {64, 320, 80};
+    std::vector<size_t> outLengths = {64, 320, 80};
 
     if(argc == 1)
     {
@@ -114,11 +114,26 @@ int main(int argc, char* argv[])
         init_method = 2;
         time_kernel = true;
     }
-    else if(argc == 4)
+    else if((argc == 4) || (argc == 9))
     {
         do_verify   = static_cast<bool>(argv[1]);
         init_method = atoi(argv[2]);
         time_kernel = static_cast<bool>(atoi(argv[3]));
+        if(argc == 9)
+        {
+            inLengths_1[0] = atoi(argv[4]);
+            inLengths_1[1] = atoi(argv[5]);
+            inLengths_1[2] = atoi(argv[6]);
+            inLengths_1[3] = atoi(argv[7]);
+            inLengths_1[4] = atoi(argv[8]);
+            inLengths_2[0] = inLengths_1[0];
+            inLengths_2[1] = inLengths_1[1];
+            inLengths_2[2] = inLengths_1[2];
+            inLengths_2[3] = inLengths_1[3];
+            outLengths[0]  = inLengths_1[0];
+            outLengths[1]  = inLengths_1[1];
+            outLengths[2]  = inLengths_1[2];
+        }
     }
     else
    {
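The argc == 9 branch above encodes an invariant rather than taking nine free sizes: the second reduction's input lengths are the leading four entries of inLengths_1, and the final output lengths are the leading three. A minimal sketch of that prefix relationship, using a hypothetical helper name that is not part of the example:

#include <cstddef>
#include <vector>

// Hypothetical helper: the downstream shapes are prefixes of inLengths_1,
// so deriving them replaces the element-by-element copies in the diff above.
std::vector<std::size_t> leading(const std::vector<std::size_t>& in, std::size_t n)
{
    return {in.begin(), in.begin() + n};
}

// usage sketch: inLengths_2 = leading(inLengths_1, 4); outLengths = leading(inLengths_1, 3);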
diff --git a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
index 26a03f289d..a1b952259f 100644
--- a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
+++ b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
@@ -50,14 +50,14 @@ template<> struct emb_kernel { using kernel_type = DeviceInsta
 // clang-format on
 
-int main()
+int main(int argc, char* argv[])
 {
     bool time_kernel = true;
 
-    constexpr auto num_rows     = 65536;
-    constexpr auto dims         = ck::Sequence<256, 512, 768, 1024, 1536, 2048, 4096, 8192>{};
-    // constexpr auto dims = ck::Sequence<256, 512>{};
-    constexpr auto index_length = 2048;
+    ck::index_t num_rows      = 65536;
+    constexpr auto dims       = ck::Sequence<256, 512, 768, 1024, 1536, 2048, 4096, 8192>{};
+    ck::index_t index_length  = 2048;
+    ck::index_t dim_mask      = 0xffff;
 
     constexpr AccDataType epsilon = 1e-4;
 
     auto f_host_tensor_desc_1d = [](std::size_t len_) { return HostTensorDescriptor({len_}); };
@@ -73,121 +73,140 @@ int main()
                                                     BetaDataType,
                                                     AccDataType,
                                                     OutType>;
-
+    if(argc == 1)
+    {
+        // Use default value
+    }
+    else if(argc == 4)
+    {
+        num_rows     = atoi(argv[1]);
+        dim_mask     = strtol(argv[2], nullptr, 0);
+        index_length = atoi(argv[3]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1-3: num_rows dim_mask index_length" << std::endl;
+    }
     ck::static_for<0, dims.Size(), 1>{}([&](auto I) {
-        std::srand(std::time(nullptr));
-        constexpr auto current_dim = dims.At(I);
-        Tensor<EmbType> emb_a(f_host_tensor_desc_2d(num_rows, current_dim));
-        Tensor<EmbType> emb_b(f_host_tensor_desc_2d(num_rows, current_dim));
-        Tensor<EmbType> emb_c(f_host_tensor_desc_2d(num_rows, current_dim));
-
-        Tensor<IndexType> index_a(f_host_tensor_desc_1d(index_length));
-        Tensor<IndexType> index_b(f_host_tensor_desc_1d(index_length));
-        Tensor<IndexType> index_c(f_host_tensor_desc_1d(index_length));
-
-        Tensor<GammaDataType> gamma(f_host_tensor_desc_1d(current_dim));
-        Tensor<BetaDataType> beta(f_host_tensor_desc_1d(current_dim));
-
-        Tensor<OutType> out(f_host_tensor_desc_2d(index_length, current_dim));
-
-        emb_a.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
-        emb_b.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
-        emb_c.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
-
-        index_a.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
-        index_b.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
-        index_c.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
-
-        gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
-        beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
-
-        DeviceMem emb_a_dev(sizeof(EmbType) * emb_a.mDesc.GetElementSpaceSize());
-        DeviceMem emb_b_dev(sizeof(EmbType) * emb_b.mDesc.GetElementSpaceSize());
-        DeviceMem emb_c_dev(sizeof(EmbType) * emb_c.mDesc.GetElementSpaceSize());
-
-        DeviceMem index_a_dev(sizeof(IndexType) * index_a.mDesc.GetElementSpaceSize());
-        DeviceMem index_b_dev(sizeof(IndexType) * index_b.mDesc.GetElementSpaceSize());
-        DeviceMem index_c_dev(sizeof(IndexType) * index_c.mDesc.GetElementSpaceSize());
-
-        DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
-        DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
-
-        DeviceMem out_dev(sizeof(OutType) * out.mDesc.GetElementSpaceSize());
-
-        emb_a_dev.ToDevice(emb_a.mData.data());
-        emb_b_dev.ToDevice(emb_b.mData.data());
-        emb_c_dev.ToDevice(emb_c.mData.data());
-
-        index_a_dev.ToDevice(index_a.mData.data());
-        index_b_dev.ToDevice(index_b.mData.data());
-        index_c_dev.ToDevice(index_c.mData.data());
-
-        gamma_dev.ToDevice(gamma.mData.data());
-        beta_dev.ToDevice(beta.mData.data());
-
-        auto device_instance = typename emb_kernel::kernel_type{};
-        auto argument_ptr    = device_instance.MakeArgumentPointer(
-            out_dev.GetDeviceBuffer(),
-            {ck::type_convert<EmbType*>(emb_a_dev.GetDeviceBuffer()),
-             ck::type_convert<EmbType*>(emb_b_dev.GetDeviceBuffer()),
-             ck::type_convert<EmbType*>(emb_c_dev.GetDeviceBuffer())},
-            {ck::type_convert<IndexType*>(index_a_dev.GetDeviceBuffer()),
-             ck::type_convert<IndexType*>(index_b_dev.GetDeviceBuffer()),
-             ck::type_convert<IndexType*>(index_c_dev.GetDeviceBuffer())},
-            gamma_dev.GetDeviceBuffer(),
-            beta_dev.GetDeviceBuffer(),
-            current_dim,
-            index_length,
-            epsilon,
-            EmbElementwiseOperation{});
-        std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString()
-                  << std::endl
-                  << std::flush;
-
-        bool is_supported = device_instance.IsSupportedArgument(argument_ptr.get());
-
-        if(!is_supported)
+        if(dim_mask & (1 << I.value))
         {
-            std::cout << "Runtime parameters are not supported" << std::endl;
-            return;
+            std::srand(std::time(nullptr));
+            constexpr auto current_dim = dims.At(I);
+            Tensor<EmbType> emb_a(f_host_tensor_desc_2d(num_rows, current_dim));
+            Tensor<EmbType> emb_b(f_host_tensor_desc_2d(num_rows, current_dim));
+            Tensor<EmbType> emb_c(f_host_tensor_desc_2d(num_rows, current_dim));
+
+            Tensor<IndexType> index_a(f_host_tensor_desc_1d(index_length));
+            Tensor<IndexType> index_b(f_host_tensor_desc_1d(index_length));
+            Tensor<IndexType> index_c(f_host_tensor_desc_1d(index_length));
+
+            Tensor<GammaDataType> gamma(f_host_tensor_desc_1d(current_dim));
+            Tensor<BetaDataType> beta(f_host_tensor_desc_1d(current_dim));
+
+            Tensor<OutType> out(f_host_tensor_desc_2d(index_length, current_dim));
+
+            emb_a.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
+            emb_b.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
+            emb_c.GenerateTensorValue(GeneratorTensor_3<EmbType>{0.0, 1.0});
+
+            index_a.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
+            index_b.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
+            index_c.GenerateTensorValue(GeneratorTensor_2<IndexType>{0, num_rows});
+
+            gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
+            beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});
+
+            DeviceMem emb_a_dev(sizeof(EmbType) * emb_a.mDesc.GetElementSpaceSize());
+            DeviceMem emb_b_dev(sizeof(EmbType) * emb_b.mDesc.GetElementSpaceSize());
+            DeviceMem emb_c_dev(sizeof(EmbType) * emb_c.mDesc.GetElementSpaceSize());
+
+            DeviceMem index_a_dev(sizeof(IndexType) * index_a.mDesc.GetElementSpaceSize());
+            DeviceMem index_b_dev(sizeof(IndexType) * index_b.mDesc.GetElementSpaceSize());
+            DeviceMem index_c_dev(sizeof(IndexType) * index_c.mDesc.GetElementSpaceSize());
+
+            DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
+            DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
+
+            DeviceMem out_dev(sizeof(OutType) * out.mDesc.GetElementSpaceSize());
+
+            emb_a_dev.ToDevice(emb_a.mData.data());
+            emb_b_dev.ToDevice(emb_b.mData.data());
+            emb_c_dev.ToDevice(emb_c.mData.data());
+
+            index_a_dev.ToDevice(index_a.mData.data());
+            index_b_dev.ToDevice(index_b.mData.data());
+            index_c_dev.ToDevice(index_c.mData.data());
+
+            gamma_dev.ToDevice(gamma.mData.data());
+            beta_dev.ToDevice(beta.mData.data());
+
+            auto device_instance = typename emb_kernel::kernel_type{};
+            auto argument_ptr    = device_instance.MakeArgumentPointer(
+                out_dev.GetDeviceBuffer(),
+                {ck::type_convert<EmbType*>(emb_a_dev.GetDeviceBuffer()),
+                 ck::type_convert<EmbType*>(emb_b_dev.GetDeviceBuffer()),
+                 ck::type_convert<EmbType*>(emb_c_dev.GetDeviceBuffer())},
+                {ck::type_convert<IndexType*>(index_a_dev.GetDeviceBuffer()),
+                 ck::type_convert<IndexType*>(index_b_dev.GetDeviceBuffer()),
+                 ck::type_convert<IndexType*>(index_c_dev.GetDeviceBuffer())},
+                gamma_dev.GetDeviceBuffer(),
+                beta_dev.GetDeviceBuffer(),
+                current_dim,
+                index_length,
+                epsilon,
+                EmbElementwiseOperation{});
+            std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString()
+                      << std::endl
+                      << std::flush;
+
+            bool is_supported = device_instance.IsSupportedArgument(argument_ptr.get());
+
+            if(!is_supported)
+            {
+                std::cout << "Runtime parameters are not supported" << std::endl;
+                return;
+            }
+
+            auto invoker_ptr = device_instance.MakeInvokerPointer();
+            float time_ms =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+            bool pass = true;
+            {
+                Tensor<OutType> out_from_dev(f_host_tensor_desc_2d(index_length, current_dim));
+                ReferenceInstance ref;
+                auto ref_argument = ref.MakeArgument(out,
+                                                     emb_a,
+                                                     emb_b,
+                                                     emb_c,
+                                                     index_a,
+                                                     index_b,
+                                                     index_c,
+                                                     gamma,
+                                                     beta,
+                                                     num_rows,
+                                                     current_dim,
+                                                     index_length,
+                                                     epsilon);
+                auto ref_invoker = ref.MakeInvoker();
+                ref_invoker.Run(ref_argument);
+
+                out_dev.FromDevice(out_from_dev.mData.data());
+                pass &=
+                    ck::utils::check_err(out_from_dev, out, "Error: Incorrect results", 1e-3, 1e-3);
+            }
+
+            double total_read = current_dim * index_length * 3 * sizeof(EmbType) +
+                                current_dim * sizeof(GammaDataType) +
+                                current_dim * sizeof(BetaDataType);
+            double total_write = current_dim * index_length * sizeof(OutType);
+            double gbps        = (total_read + total_write) / time_ms / 1e6;
+
+            std::cout << ", total bytes:" << (total_read + total_write) << ", time:" << time_ms
+                      << ", gbps:" << gbps << ", valid:" << (pass ? "y" : "n") << std::endl
+                      << std::flush;
         }
-
-        auto invoker_ptr = device_instance.MakeInvokerPointer();
-        float time_ms = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-
-        bool pass = true;
-        {
-            Tensor<OutType> out_from_dev(f_host_tensor_desc_2d(index_length, current_dim));
-            ReferenceInstance ref;
-            auto ref_argument = ref.MakeArgument(out,
-                                                 emb_a,
-                                                 emb_b,
-                                                 emb_c,
-                                                 index_a,
-                                                 index_b,
-                                                 index_c,
-                                                 gamma,
-                                                 beta,
-                                                 num_rows,
-                                                 current_dim,
-                                                 index_length,
-                                                 epsilon);
-            auto ref_invoker = ref.MakeInvoker();
-            ref_invoker.Run(ref_argument);
-
-            out_dev.FromDevice(out_from_dev.mData.data());
-            pass &= ck::utils::check_err(out_from_dev, out, "Error: Incorrect results", 1e-3, 1e-3);
-        }
-
-        double total_read = current_dim * index_length * 3 * sizeof(EmbType) +
-                            current_dim * sizeof(GammaDataType) +
-                            current_dim * sizeof(BetaDataType);
-        double total_write = current_dim * index_length * sizeof(OutType);
-        double gbps = (total_read + total_write) / time_ms / 1e6;
-
-        std::cout << ", total bytes:" << (total_read + total_write) << ", time:" << time_ms
-                  << ", gbps:" << gbps << ", valid:" << (pass ? "y" : "n") << std::endl
-                  << std::flush;
     });
 
     return 0;
diff --git a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
index 31d1bef520..8ddd432c11 100644
--- a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp
@@ -68,6 +68,24 @@ int main(int argc, char* argv[])
     }
 
     std::vector<std::size_t> nchw = {16, 128, 32, 64};
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 5)
+    {
+        nchw[0] = std::stoi(argv[1]);
+        nchw[1] = std::stoi(argv[2]);
+        nchw[2] = std::stoi(argv[3]);
+        nchw[3] = std::stoi(argv[4]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 4: N, C, H, W" << std::endl;
+
+        return 1;
+    }
+
     std::array<ck::index_t, 4> ab_lengths;
     std::array<ck::index_t, 4> ab_strides = {static_cast<ck::index_t>(nchw[1] * nchw[2] * nchw[3]),
                                              static_cast<ck::index_t>(nchw[2] * nchw[3]),
diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
index 51006e676b..8064809123 100644
--- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
+++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <...>
 #include <...>
@@ -98,8 +98,23 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    ck::index_t M      = 48 * 256;
-    ck::index_t N      = 1024;
+    ck::index_t M = 48 * 256;
+    ck::index_t N = 1024;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 3)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 2: M, N" << std::endl;
+        return 1;
+    }
+
     ck::index_t Stride = N;
 
     auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
diff --git a/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp b/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp
index 6cf1b2ff91..dcbb472118 100644
--- a/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp
+++ b/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp
@@ -100,7 +100,7 @@ using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizatio
     4,  // DGammaDstVectorSize
     4>; // DBetaDstVectorSize
 
-int main()
+int main(int argc, char* argv[])
 {
     bool time_kernel = false;
 
@@ -110,6 +110,25 @@ int main()
     ck::index_t G = 32;
     ck::index_t C = 64;
 
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 6)
+    {
+        N = std::stoi(argv[1]);
+        H = std::stoi(argv[2]);
+        W = std::stoi(argv[3]);
+        G = std::stoi(argv[4]);
+        C = std::stoi(argv[5]);
+    }
+    else
+    {
+        std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
+
+        return 1;
+    }
+
     Tensor<DYDataType> dy({N, H, W, G, C});
     Tensor<XDataType> x({N, H, W, G, C});
     Tensor<GammaDataType> gamma({G, C});
diff --git a/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp b/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp
index 7cf0fed74f..537a4703d3 100644
--- a/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp
@@ -39,7 +39,8 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 4;
     constexpr index_t WindowRank = 2;
@@ -166,6 +167,11 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
         {
             ++num_kernel;
             instance_found = true;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -249,7 +255,11 @@ bool profile_avg_pool2d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "avg_pool2d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass && instance_found;
 }
diff --git a/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp b/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
index fba8f6f67f..c97e42228d 100644
--- a/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
@@ -48,7 +48,8 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 5;
     constexpr index_t WindowRank = 3;
@@ -166,6 +167,11 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -246,7 +252,11 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "avg_pool3d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
index 2f6a50cbd4..ca0d031dba 100644
--- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
@@ -49,10 +49,10 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
                                                          int O,
                                                          int G0,
                                                          int G1,
-                                                         float alpha = -1.f)
+                                                         float alpha = -1.f,
+                                                         int instance_index = -1)
 {
-
     using PassThrough = tensor_operation::element_wise::PassThrough;
     using ScaleAdd    = tensor_operation::element_wise::ScaleAdd;
 
     using AElementOp = PassThrough;
@@ -277,7 +277,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
@@ -314,6 +314,13 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+
             std::string op_name = op_ptr->GetTypeString();
 
             float ave_time =
@@ -392,6 +399,11 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_bias_softmax_gemm_permute_instance (" << instance_index << "/"
+                  << num_kernel << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_batched_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_impl.hpp
index 79ca7029c6..0fdda68c4d 100644
--- a/profiler/include/profiler/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp
@@ -47,7 +47,8 @@ bool profile_batched_gemm_impl(int do_verification,
                                int BatchStrideA,
                                int BatchStrideB,
                                int BatchStrideC,
-                               int BatchCount)
+                               int BatchCount,
+                               int instance_index = -1)
 {
     bool pass = true;
 
@@ -138,6 +139,7 @@ bool profile_batched_gemm_impl(int do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
+    int num_kernel        = 0;
 
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
@@ -203,6 +205,12 @@ bool profile_batched_gemm_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // re-init C to zero before profiling next kernel
             c_device_buf.SetZero();
 
@@ -259,6 +267,11 @@ bool profile_batched_gemm_impl(int do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
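Each profiler header above inlines the same three-line filter. Factored out for clarity, the check looks like the sketch below; the helper name is hypothetical (the PR keeps the logic inline). Note the off-by-one convention: instance_index is a 0-based user choice, while num_kernel counts supported instances starting from 1, hence the "+ 1"; -1 keeps the old profile-everything behavior.

// Hypothetical distillation of the repeated skip pattern:
inline bool skip_instance(int instance_index, int num_kernel)
{
    return (instance_index != -1) && (instance_index + 1 != num_kernel);
}

// usage inside the instance loop:
//     ++num_kernel;
//     if(skip_instance(instance_index, num_kernel))
//         continue;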
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
index 03fa1b1371..183b0e183a 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
@@ -40,19 +40,19 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
                                             int N,
                                             int K,
                                             int O,
-                                            int BatchCount    = 1,
-                                            int StrideA       = -1,
-                                            int StrideB0      = -1,
-                                            int StrideB1      = -1,
-                                            int StrideC       = -1,
-                                            int BatchStrideA  = -1,
-                                            int BatchStrideB0 = -1,
-                                            int BatchStrideB1 = -1,
-                                            int BatchStrideC  = -1,
-                                            float alpha       = -1.f)
+                                            int BatchCount     = 1,
+                                            int StrideA        = -1,
+                                            int StrideB0       = -1,
+                                            int StrideB1       = -1,
+                                            int StrideC        = -1,
+                                            int BatchStrideA   = -1,
+                                            int BatchStrideB0  = -1,
+                                            int BatchStrideB1  = -1,
+                                            int BatchStrideC   = -1,
+                                            float alpha        = -1.f,
+                                            int instance_index = -1)
 {
-
     using Row = tensor_layout::gemm::RowMajor;
     using Col = tensor_layout::gemm::ColumnMajor;
 
     using PassThrough = tensor_operation::element_wise::PassThrough;
@@ -253,7 +253,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
@@ -285,6 +285,13 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+
             std::string op_name = op_ptr->GetTypeString();
 
             float ave_time =
@@ -341,7 +348,11 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_softmax_gemm_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
index 2945a4a66d..e953cc4b66 100644
--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
@@ -48,10 +48,10 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
                                                     int O,
                                                     int G0,
                                                     int G1,
-                                                    float alpha = -1.f)
+                                                    float alpha = -1.f,
+                                                    int instance_index = -1)
 {
-
     using PassThrough = tensor_operation::element_wise::PassThrough;
     using Scale       = tensor_operation::element_wise::Scale;
 
     using AElementOp = PassThrough;
@@ -254,6 +254,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
+    int num_kernel        = 0;
 
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
@@ -287,6 +288,13 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+
             std::string op_name = op_ptr->GetTypeString();
 
             float ave_time =
@@ -362,7 +370,11 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "batched_gemm_softmax_gemm_permute_instance (" << instance_index << "/"
+                  << num_kernel << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
index 3343b5e66e..bf5a661407 100644
--- a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
@@ -34,7 +34,8 @@ bool profile_batchnorm_backward_impl(bool do_verification,
                                      const std::vector<size_t> inOutLengths,
                                      const std::vector<int> reduceDims,
                                      bool haveSavedMeanInvVar,
-                                     double epsilon)
+                                     double epsilon,
+                                     index_t instance_index = -1)
 {
     if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
     {
@@ -293,6 +294,11 @@ bool profile_batchnorm_backward_impl(bool do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -382,7 +388,11 @@ bool profile_batchnorm_backward_impl(bool do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "batchnorm_backward_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
index 2f9538b16c..078f6bff87 100644
--- a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
@@ -35,7 +35,8 @@ bool profile_batchnorm_forward_impl(int do_verification,
                                     bool updateMovingAverage,
                                     bool saveMeanAndInvVariance,
                                     double averageFactor,
-                                    double epsilon)
+                                    double epsilon,
+                                    index_t instance_index = -1)
 {
     if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
     {
@@ -287,6 +288,11 @@ bool profile_batchnorm_forward_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -404,7 +410,11 @@ bool profile_batchnorm_forward_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "batchnorm_forward_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
index 1b31a2aabf..c866b88e8a 100644
--- a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
@@ -32,7 +32,8 @@ bool profile_batchnorm_infer_impl(int do_verification,
                                   bool time_kernel,
                                   const std::vector<size_t> inOutLengths,
                                   const std::vector<int> reduceDims,
-                                  double epsilon)
+                                  double epsilon,
+                                  index_t instance_index = -1)
 {
     if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
     {
@@ -253,6 +254,11 @@ bool profile_batchnorm_infer_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -327,7 +333,11 @@ bool profile_batchnorm_infer_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "batchnorm_infer_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_contraction_impl.hpp b/profiler/include/profiler/profile_contraction_impl.hpp
index 616e824ce1..361861a6d1 100644
--- a/profiler/include/profiler/profile_contraction_impl.hpp
+++ b/profiler/include/profiler/profile_contraction_impl.hpp
@@ -54,7 +54,8 @@ int profile_contraction_impl(ck::index_t do_verification,
                             const std::vector<ck::index_t>& StridesA, // [M0, M1, K0, K1]
                             const std::vector<ck::index_t>& StridesB, // [N0, N1, K0, K1]
                             const std::vector<ck::index_t>& StridesE, // [M0, M1, N0, N1]
-                            const std::vector<ck::index_t>& StridesD) // [M0, M1, N0, N1]
+                            const std::vector<ck::index_t>& StridesD, // [M0, M1, N0, N1]
+                            int instance_index = -1)
 {
     bool pass = true;
 
@@ -197,7 +198,7 @@ int profile_contraction_impl(ck::index_t do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
@@ -256,6 +257,12 @@ int profile_contraction_impl(ck::index_t do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // re-init C to zero before profiling next kernel
             e_device_buf.SetZero();
 
@@ -376,6 +383,11 @@ int profile_contraction_impl(ck::index_t do_verification,
               << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
               << best_op_name << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "contraction_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
index 5ea1a78094..8f7adebdd4 100644
--- a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
@@ -58,7 +58,8 @@ bool profile_conv_bwd_data_impl(int do_verification,
                                 int init_method,
                                 bool do_log,
                                 bool time_kernel,
-                                const ck::utils::conv::ConvParam& conv_param)
+                                const ck::utils::conv::ConvParam& conv_param,
+                                int instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -174,7 +175,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device Conv instances
     bool pass = true;
 
@@ -200,6 +201,12 @@ bool profile_conv_bwd_data_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // for conv bwd data, some input tensor element are zero, but not written by kernel,
             // need to set zero
             in_device_buf.SetZero();
@@ -263,7 +270,11 @@ bool profile_conv_bwd_data_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "conv_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_conv_fwd_impl.hpp b/profiler/include/profiler/profile_conv_fwd_impl.hpp
index 37366821c4..200409fe61 100644
--- a/profiler/include/profiler/profile_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_impl.hpp
@@ -36,7 +36,8 @@ bool profile_conv_fwd_impl(int do_verification,
                            int init_method,
                            bool do_log,
                           bool time_kernel,
-                           const ck::utils::conv::ConvParam& conv_param)
+                           const ck::utils::conv::ConvParam& conv_param,
+                           int instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -156,7 +157,7 @@ bool profile_conv_fwd_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     bool pass = true;
 
@@ -182,6 +183,12 @@ bool profile_conv_fwd_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // re-init output to zero before profiling next kernel
             out_device_buf.SetZero();
 
@@ -236,7 +243,11 @@ bool profile_conv_fwd_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "conv_fwd_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
index aafb7b260d..171ae1662b 100644
--- a/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
+++ b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
@@ -122,7 +122,8 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
                                         int init_method,
                                         bool do_log,
                                         bool time_kernel,
-                                        const ck::utils::conv::ConvParam& conv_param)
+                                        const ck::utils::conv::ConvParam& conv_param,
+                                        index_t instance_index = -1)
 {
     const ck::index_t NDoHoWo =
         conv_param.N_ *
@@ -226,7 +227,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
     // profile device op instances
     bool pass                   = true;
     bool is_supporting_instance = false;
-
+    index_t num_kernel          = 0;
     for(auto& op_ptr : op_ptrs)
     {
         auto argument_ptr = op_ptr->MakeArgumentPointer(
@@ -247,6 +248,12 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             is_supporting_instance = true;
             // re-init output to zero before profiling next kernel
             out_device_buf.SetZero();
@@ -291,6 +298,11 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\nGB/s: " << best_gb_per_sec << std::endl;
+    if(instance_index != -1)
+    {
+        std::cout << "conv_tensor_rearrange_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return is_supporting_instance && pass;
 }
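One subtlety of the shared pattern worth spelling out: num_kernel keeps advancing through every supported instance even while skipping, so the denominator in the trailing "(index/num_kernel): Passed" line reports how many supported instances were scanned, not how many ran. A small runnable demonstration under assumed values:

#include <iostream>

// Demonstrates the counting semantics of the skip filter used above.
int main()
{
    const int instance_index = 2; // 0-based choice, as in the profiler changes
    int num_kernel           = 0;
    for(int i = 0; i < 5; ++i) // pretend all 5 instances are supported
    {
        ++num_kernel;
        if(instance_index != -1 && instance_index + 1 != num_kernel)
            continue;
        std::cout << "running instance " << i << '\n'; // only i == 2 runs
    }
    // prints "demo_instance (2/5): Passed" -- denominator is the full count
    std::cout << "demo_instance (" << instance_index << "/" << num_kernel << "): Passed\n";
}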
diff --git a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
index 220076465d..ca08f48bcf 100644
--- a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
@@ -49,7 +49,8 @@ bool profile_elementwise_layernorm_impl(int do_verification,
                                         int init_method,
                                         bool do_log,
                                         bool time_kernel,
-                                        std::vector<index_t> length)
+                                        std::vector<index_t> length,
+                                        index_t instance_index = -1)
 {
     using Add         = ck::tensor_operation::element_wise::Add;
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -199,6 +200,11 @@ bool profile_elementwise_layernorm_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -270,6 +276,11 @@ bool profile_elementwise_layernorm_impl(int do_verification,
         return false;
     }
 
+    if(instance_index != -1)
+    {
+        std::cout << "elementwise_layernorm_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
index 470cc86d1b..74a1b60fe3 100644
--- a/profiler/include/profiler/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
@@ -70,7 +70,8 @@ bool profile_gemm_reduce_impl(int do_verification,
                               int K,
                               int StrideA,
                               int StrideB,
-                              int StrideC)
+                              int StrideC,
+                              int instance_index = -1)
 {
     bool pass = true;
 
@@ -249,7 +250,7 @@ bool profile_gemm_reduce_impl(int do_verification,
     float best_ave_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device GEMM instances
     for(auto& gemm_ptr : gemm_ptrs)
     {
@@ -275,6 +276,12 @@ bool profile_gemm_reduce_impl(int do_verification,
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // init DO, D1 to 0
             reduce0_device_buf.SetZero();
             reduce1_device_buf.SetZero();
@@ -345,7 +352,11 @@ bool profile_gemm_reduce_impl(int do_verification,
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "gemm_reduce_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
index 8032730199..744db27675 100644
--- a/profiler/include/profiler/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
@@ -44,7 +44,8 @@ bool profile_gemm_splitk_impl(int do_verification,
                               int StrideC,
                               int KBatch,
                               int n_warmup,
-                              int n_iter)
+                              int n_iter,
+                              int instance_index = -1)
 {
     bool pass = true;
 
@@ -141,6 +142,7 @@ bool profile_gemm_splitk_impl(int do_verification,
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
     float best_kbatch     = 0;
+    int num_kernel        = 0;
 
     // profile device GEMM instances
     for(auto& op_ptr : op_ptrs)
@@ -175,7 +177,12 @@ bool profile_gemm_splitk_impl(int do_verification,
 
         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
             // re-init C to zero before profiling next kernel
             c_device_buf.SetZero();
 
@@ -294,7 +301,11 @@ bool profile_gemm_splitk_impl(int do_verification,
              << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch
              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
              << " GB/s, " << best_op_name << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "gemm_splitk_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
index 29b2fece6b..a7c6717f58 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
@@ -35,7 +35,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
                                         bool do_log,
                                         bool time_kernel,
                                         const ck::utils::conv::ConvParam& conv_param,
-                                        ck::index_t split_k = 1)
+                                        ck::index_t split_k    = 1,
+                                        index_t instance_index = -1)
 {
     using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -123,9 +124,9 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
     ck::index_t best_split_k = 1;
 
     // profile device op instances
-    bool pass = true;
-
-    auto run_impl = [&](auto& op_ptr, auto& argument_ptr, const index_t& split_k_for_run) {
+    bool pass          = true;
+    index_t num_kernel = 0;
+    auto run_impl      = [&](auto& op_ptr, auto& argument_ptr, const index_t& split_k_for_run) {
        // workspace_sz will be equal to 0 for other layout than NGCHW
        const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
        DeviceMem workspace_dev(workspace_sz);
@@ -133,6 +134,12 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
 
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+           num_kernel++;
+           if((instance_index != -1) && (instance_index + 1 != num_kernel))
+           {
+               // skip test if instance_index is specified
+               return;
+           }
            std::string op_name = op_ptr->GetTypeString();
 
            auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -165,8 +172,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
                in_device_buf.FromDevice(in_device.mData.data());
 
                using ComputeType = std::conditional_t<sizeof(OutDataType) < sizeof(WeiDataType),
-                                                      OutDataType,
-                                                      WeiDataType>;
+                                                       OutDataType,
+                                                       WeiDataType>;
                using AccDataType =
                    std::conditional_t<std::is_same_v<ComputeType, int8_t>, int32_t, float>;
                const index_t num_accums = conv_param.K_;
@@ -297,6 +304,11 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
              << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
              << "\nGB/s: " << best_gb_per_sec << ", SplitK " << best_split_k << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
index 479fed78e7..6654275fd0 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -41,7 +41,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
                                           bool do_log,
                                           bool time_kernel,
                                           const ck::utils::conv::ConvParam& conv_param,
-                                          const std::string& split_k)
+                                          const std::string& split_k,
+                                          index_t instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -187,6 +188,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
         }
     }
 
+    index_t num_kernel = 0;
     for(auto& op_ptr : op_ptrs)
     {
         for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
@@ -226,6 +228,12 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
 
             if(op_ptr->IsSupportedArgument(argument_ptr.get()))
             {
+                num_kernel++;
+                if((instance_index != -1) && (instance_index + 1 != num_kernel))
+                {
+                    // skip test if instance_index is specified
+                    continue;
+                }
 
                 std::string op_name = op_ptr->GetTypeString();
 
@@ -326,6 +334,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
              << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
              << "\nGB/s: " << best_gb_per_sec << ", SplitK " << best_split_k << std::endl;
 
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_bwd_weight_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return all_pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
index 91ac2a0ab6..2f7f3ae4d8 100644
--- a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
@@ -126,7 +126,8 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
                                               int init_method,
                                               bool do_log,
                                               bool time_kernel,
-                                              const ck::utils::conv::ConvParam& conv_param)
+                                              const ck::utils::conv::ConvParam& conv_param,
+                                              int instance_index = -1)
 {
     const float floor = 0.f;
     const float ceil  = 2048.f;
@@ -295,6 +296,7 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
+    int num_kernel        = 0;
 
     // profile device op instances
     bool pass = true;
@@ -307,6 +309,13 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
 
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+           ++num_kernel;
+           if((instance_index != -1) && (instance_index + 1 != num_kernel))
+           {
+               // skip test if instance_index is specified
+               std::cout << op_ptr->GetTypeString() << " skipped" << std::endl;
+               return;
+           }
            // re-init output to zero before profiling next kernel
            out_device_buf.SetZero();
 
@@ -420,7 +429,11 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_fwd_bias_bnorm_clamp_instance (" << instance_index << "/"
+                  << num_kernel << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp
index 188d7aa0b0..2dbadd8eb1 100644
--- a/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp
@@ -64,7 +64,8 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
                                               int init_method,
                                               bool do_log,
                                               bool time_kernel,
-                                              const ck::utils::conv::ConvParam& conv_param)
+                                              const ck::utils::conv::ConvParam& conv_param,
+                                              int instance_index = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -194,7 +195,7 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    int num_kernel        = 0;
     // profile device op instances
     bool pass = true;
 
@@ -206,6 +207,13 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
 
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+           ++num_kernel;
+           if((instance_index != -1) && (instance_index + 1 != num_kernel))
+           {
+               // skip test if instance_index is specified
+               std::cout << op_ptr->GetTypeString() << " skipped" << std::endl;
+               return;
+           }
            // re-init output to zero before profiling next kernel
            out_device_buf.SetZero();
 
@@ -317,7 +325,11 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_fwd_bias_clamp_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
index 2dcee4c1fc..d490cf4167 100644
--- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
@@ -42,7 +42,8 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
                                    bool do_log,
                                    bool time_kernel,
                                    const ck::utils::conv::ConvParam& conv_param,
-                                   const OutElementOp out_element_op = OutElementOp{})
+                                   const OutElementOp out_element_op = OutElementOp{},
+                                   index_t instance_index            = -1)
 {
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -144,7 +145,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
     float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
-
+    index_t num_kernel    = 0;
     // profile device op instances
     bool pass = true;
 
@@ -156,6 +157,13 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
 
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+           num_kernel++;
+           if((instance_index != -1) && (instance_index + 1 != num_kernel))
+           {
+               // skip test if instance_index is specified
+               return;
+           }
+
            std::string op_name = op_ptr->GetTypeString();
 
            auto invoker_ptr = op_ptr->MakeInvokerPointer();
@@ -253,7 +261,11 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
     std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_conv_fwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass;
 }
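Note the control-flow split among the impls above: loop-based profilers skip an instance with continue, while grouped_conv_bwd_data and grouped_conv_fwd route each instance through a run_impl lambda, where continue is unavailable and an early return skips just that instance. A minimal sketch of that shape, with hypothetical names throughout:

#include <functional>
#include <vector>

// Sketch: when per-instance work lives in a lambda, an early "return" from the
// lambda plays the role that "continue" plays in the inline-loop variants.
void profile_all(const std::vector<std::function<bool()>>& instances, int instance_index)
{
    int num_kernel = 0;
    auto run_impl  = [&](const std::function<bool()>& inst) {
        ++num_kernel;
        if(instance_index != -1 && instance_index + 1 != num_kernel)
            return; // skips this instance only
        inst();
    };
    for(const auto& inst : instances)
        run_impl(inst);
}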
diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
index eef5e02911..8314b9053f 100644
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -44,7 +44,8 @@ bool profile_grouped_gemm_impl(int do_verification,
                               const std::vector<int>& StrideCs,
                               const std::vector<int>& kbatches = {},
                               int n_warmup                     = 1,
-                               int n_iter                       = 10)
+                               int n_iter                       = 10,
+                               int instance_index               = -1)
 {
     bool pass = true;
     // TODO: Fixme - we do not pass compute data type here but need it
@@ -195,8 +196,8 @@ bool profile_grouped_gemm_impl(int do_verification,
     float best_tflops     = 0;
     float best_gb_per_sec = 0;
     float best_kbatch     = 0;
-
-    auto p_ds = std::vector<...>{};
+    int num_kernel = 0;
+    auto p_ds      = std::vector<...>{};
 
     if(do_verification)
     {
@@ -279,6 +280,13 @@ bool profile_grouped_gemm_impl(int do_verification,
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
+
             for(std::size_t i = 0; i < gemm_descs.size(); i++)
                 c_device_buf[i]->SetZero();
 
@@ -371,7 +379,11 @@ bool profile_grouped_gemm_impl(int do_verification,
                  << best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch
                  << std::endl;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "grouped_gemm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 }
diff --git a/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp b/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp
index 55ea08e0db..c1647815ad 100644
--- a/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp
@@ -26,7 +26,8 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
                                      bool time_kernel,
-                                     std::vector<index_t> length)
+                                     std::vector<index_t> length,
+                                     index_t instance_index = -1)
 {
     // we don't need DGamma and DBeta here, just for reference class
     using DGammaDataType = DXDataType;
@@ -162,6 +163,11 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -242,7 +248,11 @@ bool profile_groupnorm_bwd_data_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "groupnorm_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp b/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp
index d0a5032bff..60982d18d5 100644
--- a/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp
@@ -29,7 +29,8 @@ bool profile_groupnorm_impl(int do_verification,
                             int init_method,
                             bool do_log,
                             bool time_kernel,
-                            std::vector<index_t> length)
+                            std::vector<index_t> length,
+                            index_t instance_index = -1)
 {
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
@@ -178,6 +179,11 @@ bool profile_groupnorm_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -267,6 +273,12 @@ bool profile_groupnorm_impl(int do_verification,
         return false;
     }
 
+    if(instance_index != -1)
+    {
+        std::cout << "groupnorm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
+
     return true;
 }
diff --git a/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp b/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp
index e88a06122d..7704085048 100644
--- a/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp
@@ -27,7 +27,8 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
                                      bool time_kernel,
-                                     std::vector<index_t> length)
+                                     std::vector<index_t> length,
+                                     index_t instance_index = -1)
 {
     // we don't need DGamma and DBeta here, just for reference class
     using DGammaDataType = DXDataType;
@@ -167,6 +168,11 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -247,7 +253,11 @@ bool profile_layernorm_bwd_data_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "layernorm_bwd_data_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp b/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
index 10fa9c86d5..e36b20e1b5 100644
--- a/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
@@ -27,7 +27,8 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
                                            int init_method,
                                            bool do_log,
                                            bool time_kernel,
-                                           std::vector<index_t> length)
+                                           std::vector<index_t> length,
+                                           index_t instance_index = -1)
 {
     // we don't need GammaDataType and DXDataType here, just for reference class
     using GammaDataType = DYDataType;
@@ -178,6 +179,11 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -255,7 +261,11 @@ bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "layernorm_bwd_gamma_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_layernorm_fwd_impl.hpp b/profiler/include/profiler/profile_layernorm_fwd_impl.hpp
index 66272b6eff..51dcbb1275 100644
--- a/profiler/include/profiler/profile_layernorm_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_layernorm_fwd_impl.hpp
@@ -28,7 +28,8 @@ bool profile_layernorm_impl(int do_verification,
                             int init_method,
                             bool do_log,
                             bool time_kernel,
-                            std::vector<index_t> length)
+                            std::vector<index_t> length,
+                            index_t instance_index = -1)
 {
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
@@ -188,6 +189,11 @@ bool profile_layernorm_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -286,6 +292,12 @@ bool profile_layernorm_impl(int do_verification,
         return false;
     }
 
+    if(instance_index != -1)
+    {
+        std::cout << "layernorm_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
+
     return true;
 }
diff --git a/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp b/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp
index 6e3de3a26a..a8efee3ef0 100644
--- a/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp
@@ -34,7 +34,8 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     // AtomicAdd only support f32 for now. ComputeDataType must be float32
     using ComputeDataType = float;
@@ -199,6 +200,11 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
         {
             ++num_kernel;
             instance_found = true;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -289,7 +295,11 @@ bool profile_max_pool2d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "max_pool2d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return pass && instance_found;
 }
diff --git a/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp b/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
index 407337f827..cf6050969f 100644
--- a/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp
@@ -34,7 +34,8 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
                                  std::vector<index_t> window_strides,
                                  std::vector<index_t> window_dilations,
                                  std::vector<index_t> input_left_pads,
-                                 std::vector<index_t> input_right_pads)
+                                 std::vector<index_t> input_right_pads,
+                                 index_t instance_index = -1)
 {
     // AtomicAdd only support f32 for now. ComputeDataType must be float32
     using ComputeDataType = float;
@@ -193,6 +194,11 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -281,7 +287,11 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "max_pool3d_bwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
index 88162b9417..962be4448c 100644
--- a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp
@@ -35,7 +35,8 @@ bool profile_pool2d_fwd_impl(int do_verification,
                              std::vector<index_t> window_strides,
                             std::vector<index_t> window_dilations,
                             std::vector<index_t> input_left_pads,
-                             std::vector<index_t> input_right_pads)
+                             std::vector<index_t> input_right_pads,
+                             index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 4;
     constexpr index_t WindowRank = 2;
@@ -171,6 +172,11 @@ bool profile_pool2d_fwd_impl(int do_verification,
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // skip test if instance_index is specified
+                continue;
+            }
         }
         else
         {
@@ -268,7 +274,11 @@ bool profile_pool2d_fwd_impl(int do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "max_pool2d_fwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
index 412946d558..e1d0c1573d 100644
--- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp
@@ -46,7 +46,9 @@ template
-bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& kernel_params)
+bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params,
+                             PoolFwdKernelParams& kernel_params,
+                             index_t instance_index = -1)
 {
     constexpr index_t InOutRank  = 5;
     constexpr index_t WindowRank = 3;
@@ -199,6 +201,11 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
         if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             ++num_kernel;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // instance_index is specified: skip every instance except the selected one
+                continue;
+            }
         }
         else
         {
@@ -328,7 +335,11 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     }
-
+    if(instance_index != -1)
+    {
+        std::cout << "pool3d_fwd_instance (" << instance_index << "/" << num_kernel
+                  << "): Passed" << std::endl;
+    }
     return true;
 }
diff --git a/profiler/include/profiler/profile_reduce_impl.hpp b/profiler/include/profiler/profile_reduce_impl.hpp
index b54aa65aef..14a93af69d 100644
--- a/profiler/include/profiler/profile_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_reduce_impl.hpp
@@ -144,7 +144,8 @@ bool profile_reduce_impl_impl(bool do_verification,
                               const std::vector<index_t>& inLengths,
                               const std::array<int, NumReduceDim>& reduceDims,
                               float alpha,
-                              float beta)
+                              float beta,
+                              index_t instance_index = -1)
 {
     using namespace ck::tensor_operation::device;
     using namespace ck::tensor_operation::device::instance;
@@ -373,7 +374,14 @@ bool profile_reduce_impl_impl(bool do_verification,
         if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
             continue;
         else
+        {
             num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // instance_index is specified: skip every instance except the selected one
+                continue;
+            }
+        }
 
         std::string reduce_name = reduce_ptr->GetTypeString();
@@ -452,7 +460,11 @@ bool profile_reduce_impl_impl(bool do_verification,
         std::cout << "Error: No kernel is applicable" << std::endl;
         return false;
     };
-
+    if(instance_index != -1)
+    {
+        std::cout << "reduce_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return pass;
 };
@@ -467,7 +479,8 @@ bool profile_reduce_impl(bool do_verification,
                          bool PropagateNan,
                          bool UseIndex,
                          float alpha,
-                         float beta)
+                         float beta,
+                         index_t instance_index = -1)
 {
     bool matched = false;
     bool pass    = true;
@@ -505,7 +518,8 @@ bool profile_reduce_impl(bool do_verification,
                                              inLengths,
                                              arrReduceDims,
                                              alpha,
-                                             beta);
+                                             beta,
+                                             instance_index);
 
         matched = true;
     });
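One detail worth noting in the reduce changes above: the outer `profile_reduce_impl` wrapper re-declares the `= -1` default rather than relying on the one in `profile_reduce_impl_impl`, because a default argument belongs to a declaration and does not propagate through a call chain. The same mechanic in miniature (generic C++, not CK code):

    #include <iostream>

    static bool impl(int length, int instance_index = -1)
    {
        std::cout << "length=" << length << ", instance_index=" << instance_index << '\n';
        return true;
    }

    // Without re-stating "= -1" here, callers of dispatch() could not omit the
    // index: the default on impl() is invisible through the extra call layer.
    static bool dispatch(int length, int instance_index = -1)
    {
        return impl(length, instance_index); // forwarded unchanged
    }

    int main() { return dispatch(64) ? 0 : 1; }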
diff --git a/profiler/include/profiler/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp
index 83913d8398..d7a790803a 100644
--- a/profiler/include/profiler/profile_softmax_impl.hpp
+++ b/profiler/include/profiler/profile_softmax_impl.hpp
@@ -53,7 +53,8 @@ bool profile_softmax_impl(int do_verification,
                           std::vector<index_t> in_strides,
                           std::vector<index_t> reduce_dims,
                           double alpha,
-                          double beta)
+                          double beta,
+                          index_t instance_index = -1)
 {
     if(Rank != in_length.size())
     {
@@ -124,7 +125,7 @@ bool profile_softmax_impl(int do_verification,
     float best_avg_time   = std::numeric_limits<float>::max();
     float best_gb_per_sec = 0;
     std::vector<bool> instance_pass;
-
+    index_t num_kernel = 0;
     for(auto& inst_ptr : instances)
     {
         auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
@@ -146,6 +147,15 @@ bool profile_softmax_impl(int do_verification,
             instance_pass.push_back(true);
             continue;
         }
+        else
+        {
+            num_kernel++;
+            if((instance_index != -1) && (instance_index + 1 != num_kernel))
+            {
+                // instance_index is specified: skip every instance except the selected one
+                continue;
+            }
+        }
 
         out_dev.ToDevice(prior_out.data());
         auto invoker_ptr = inst_ptr->MakeInvokerPointer();
@@ -216,6 +226,11 @@ bool profile_softmax_impl(int do_verification,
         std::cout << "alpha = " << alpha << ", "
                   << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
                   << " GB/s, " << best_instance_name << std::endl;
     }
+    if(instance_index != -1)
+    {
+        std::cout << "softmax_instance (" << instance_index << "/" << num_kernel << "): Passed"
+                  << std::endl;
+    }
     return std::all_of(
         std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; });
 }
diff --git a/test/batched_gemm/test_batched_gemm_wmma.cpp b/test/batched_gemm/test_batched_gemm_wmma.cpp
index 18f9db8c39..fc190bed85 100644
--- a/test/batched_gemm/test_batched_gemm_wmma.cpp
+++ b/test/batched_gemm/test_batched_gemm_wmma.cpp
@@ -12,7 +12,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
-
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
 struct GemmParams
 {
     ck::index_t M;
@@ -37,96 +38,153 @@ class TestBatchedGemm : public ::testing::Test
         using namespace ck::tensor_operation::device;
 
         bool pass = true;
-        for(auto& param : params)
+        for(size_t i = 0; i < params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param           = params[i];
             const auto M          = param.M;
             const auto N          = param.N;
             const auto K          = param.K;
             const auto BatchCount = param.BatchCount;
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-                    true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                K,
+                N,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-                    true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                K,
+                K,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-                    true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                M,
+                N,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-                    true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                M,
+                K,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -191,3 +249,20 @@ TEST_F(TestBatchedGemm, fp16)
 //     this->template Run();
 // }
 // #endif
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
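The test executables pair `instance_index` with a `param_mask`, where bit i of the mask enables `params[i]`. A self-contained illustration of the gating (the values are hypothetical):

    // param_mask gates parameter sets by bit position: bit i keeps params[i].
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> params = {10, 20, 30, 40};
        long param_mask         = 0x5; // binary 0101: keep params[0] and params[2]

        for(std::size_t i = 0; i < params.size(); i++)
        {
            if((param_mask & (1 << i)) == 0)
                continue; // bit i is clear: skip this parameter set
            std::printf("running params[%zu] = %d\n", i, params[i]);
        }
        return 0;
    }

A typical invocation (binary name assumed) would then be `./test_batched_gemm_wmma 0x5 0`: run only the first and third parameter sets, and within each, only the first supported instance.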
diff --git a/test/batched_gemm/test_batched_gemm_xdl.cpp b/test/batched_gemm/test_batched_gemm_xdl.cpp
index f9bb626ce5..3b7c392004 100644
--- a/test/batched_gemm/test_batched_gemm_xdl.cpp
+++ b/test/batched_gemm/test_batched_gemm_xdl.cpp
@@ -13,6 +13,9 @@
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 struct GemmParams
 {
     ck::index_t M;
@@ -37,96 +40,153 @@ class TestBatchedGemm : public ::testing::Test
         using namespace ck::tensor_operation::device;
 
         bool pass = true;
-        for(auto& param : params)
+        for(size_t i = 0; i < params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param           = params[i];
             const auto M          = param.M;
             const auto N          = param.N;
             const auto K          = param.K;
             const auto BatchCount = param.BatchCount;
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-                    true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                K,
+                N,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-                    true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                K,
+                K,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-                    true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                M,
+                N,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
 
-            pass =
-                pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-                    true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+            pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                M,
+                K,
+                N,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount,
+                instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -183,3 +243,20 @@ TYPED_TEST(TestBatchedGemm, fp32)
     this->template Run();
 }
 #endif
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
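Both masks here are parsed with `strtol(argv[1], nullptr, 0)`; base 0 makes the radix self-describing, which is why `0xffff`-style arguments work on the command line. For instance:

    // strtol with base 0 auto-detects decimal, hex (0x...), and octal (0...).
    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        const char* samples[] = {"255", "0xff", "0377"};
        for(const char* s : samples)
            std::printf("%-5s -> %ld\n", s, std::strtol(s, nullptr, 0)); // all print 255
        return 0;
    }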
diff --git a/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp b/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp
index eba461a420..5e250bc356 100644
--- a/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp
+++ b/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp
@@ -7,6 +7,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
 
+static ck::index_t instance_index = -1;
+
 namespace
 {
 using F16 = ck::half_t;
@@ -70,7 +72,8 @@ class TestBatchedGemmMultiD : public ::testing::Test
                 M * K,
                 K * N,
                 M * N,
-                BatchCount);
+                BatchCount,
+                instance_index);
 
         EXPECT_TRUE(pass);
     }
 };
@@ -88,3 +91,18 @@ TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run(); }
 #ifdef CK_ENABLE_INT8
 TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run(); }
 #endif
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 2)
+    {
+        instance_index = atoi(argv[1]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1: instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp
index cb46a995c6..1ab29f251a 100644
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16_xdl.cpp
@@ -4,6 +4,9 @@
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestBatchedGemmSoftmaxGemmFP16 : public TestBatchedGemmSoftmaxGemm<Tuple>
 {
@@ -174,3 +177,20 @@ TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, AdhocTest)
     };
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
index 2611f91e66..8074d8a311 100644
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
@@ -9,6 +9,9 @@
 #include "profiler/profile_batched_gemm_softmax_gemm_impl.hpp"
 
 using ck::tensor_operation::device::GemmSpecialization;
+extern ck::index_t param_mask;
+extern ck::index_t instance_index;
+
 template <ck::index_t N>
 using I = ck::Number<N>;
 
@@ -57,15 +60,38 @@ struct TestBatchedGemmSoftmaxGemm : public ::testing::Test
                                                               B1Layout,
                                                               CLayout,
                                                               MaskingType::value>(
-            verify_, 1, false, bench_, M, N, K, O, BatchCount);
+            verify_,
+            1,
+            false,
+            bench_,
+            M,
+            N,
+            K,
+            O,
+            BatchCount,
+            -1, // StrideA
+            -1, // StrideB0
+            -1, // StrideB1
+            -1, // StrideC
+            -1, // BatchStrideA
+            -1, // BatchStrideB0
+            -1, // BatchStrideB1
+            -1, // BatchStrideC
+            -1, // alpha
+            instance_index);
 
         EXPECT_TRUE(pass);
     }
 
     void Run()
     {
-        for(auto lengths : this->lengths_)
+        for(size_t i = 0; i < this->lengths_.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& lengths = this->lengths_[i];
             int M = lengths[0];
             int N = lengths[1];
             int K = lengths[2];
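Because `instance_index` sits after the stride, batch-stride, and alpha defaults of `profile_batched_gemm_softmax_gemm_impl`, the call in `RunSingle` now has to spell all of them out as `-1` sentinels just to reach the last parameter. The same mechanic in miniature (the names here are hypothetical):

    // C++ default arguments are positional: to pass the last one explicitly,
    // every earlier default must also be written out, here as -1 sentinels.
    static int run(int m, int stride = -1, int alpha = -1, int instance_index = -1)
    {
        if(stride == -1) stride = 1; // -1 means "derive the real default"
        if(alpha == -1) alpha = 1;
        (void)instance_index; // would select a single instance, as in the profiler
        return m * stride * alpha;
    }

    int main()
    {
        return run(3, /*stride*/ -1, /*alpha*/ -1, /*instance_index*/ 0) == 3 ? 0 : 1;
    }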
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp
index ef88ce6d81..9ce603c575 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_bf16_xdl.cpp
@@ -4,6 +4,8 @@
 #include "gtest/gtest.h"
 #include "test_batched_gemm_bias_softmax_gemm_permute_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16
     : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -180,3 +182,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest)
     };
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp
index b38b10d195..40ce64837d 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_fp16_xdl.cpp
@@ -4,6 +4,8 @@
 #include "gtest/gtest.h"
 #include "test_batched_gemm_softmax_gemm_permute_util.hpp"
 
+ck::index_t param_mask     = 0xffff;
+ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16
     : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
@@ -180,3 +182,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, AdhocTest)
     };
     this->Run();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
index eda74819e9..e37cadd0c5 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp
@@ -10,7 +10,8 @@
 #include "profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp"
 #include
-
+extern ck::index_t param_mask;
+extern ck::index_t instance_index;
 using ck::tensor_operation::device::GemmSpecialization;
 using ck::tensor_operation::device::MaskingSpecialization;
 using ck::tensor_operation::device::TensorSpecialization;
@@ -66,21 +67,26 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test
             Acc0BiasDataType,
             Acc1BiasDataType,
             MaskingType::value>(
-            verify_, 2, false, bench_, M, N, K, O, G0, G1);
+            verify_, 2, false, bench_, M, N, K, O, G0, G1, -1, instance_index);
 
         EXPECT_TRUE(pass);
     }
 
     void Run()
     {
-        for(auto lengths : this->lengths_)
+        for(size_t i = 0; i < this->lengths_.size(); i++)
         {
-            int M  = lengths[0];
-            int N  = lengths[1];
-            int K  = lengths[2];
-            int O  = lengths[3];
-            int G0 = lengths[4];
-            int G1 = lengths[5];
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& lengths = this->lengths_[i];
+            int M         = lengths[0];
+            int N         = lengths[1];
+            int K         = lengths[2];
+            int O         = lengths[3];
+            int G0        = lengths[4];
+            int G1        = lengths[5];
 
             this->RunSingle(M, N, K, O, G0, G1);
         }
"test_batched_gemm_softmax_gemm_permute_util.hpp" #include "test_batched_gemm_device_utils.hpp" +ck::index_t param_mask = 0xffff; +ck::index_t instance_index = -1; template class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16 : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute @@ -228,3 +230,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest) }; this->Run(); } + +int main(int argc, char** argv) +{ + testing::InitGoogleTest(&argc, argv); + if(argc == 1) {} + else if(argc == 3) + { + param_mask = strtol(argv[1], nullptr, 0); + instance_index = atoi(argv[2]); + } + else + { + std::cout << "Usage of " << argv[0] << std::endl; + std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl; + } + return RUN_ALL_TESTS(); +} diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp index 3a86736f44..61baa50cd7 100644 --- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp +++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp @@ -5,6 +5,9 @@ #include "test_batched_gemm_softmax_gemm_permute_util.hpp" #include "test_batched_gemm_device_utils.hpp" +ck::index_t param_mask = 0xffff; +ck::index_t instance_index = -1; + template class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16 : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute @@ -191,3 +194,20 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, AdhocTest) }; this->Run(); } + +int main(int argc, char** argv) +{ + testing::InitGoogleTest(&argc, argv); + if(argc == 1) {} + else if(argc == 3) + { + param_mask = strtol(argv[1], nullptr, 0); + instance_index = atoi(argv[2]); + } + else + { + std::cout << "Usage of " << argv[0] << std::endl; + std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl; + } + return RUN_ALL_TESTS(); +} diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp index d9177ff0f2..13d2e0f0a2 100644 --- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp +++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp @@ -9,6 +9,8 @@ #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" #include "profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp" +extern ck::index_t param_mask; +extern ck::index_t instance_index; using ck::tensor_operation::device::GemmSpecialization; using ck::tensor_operation::device::MaskingSpecialization; using ck::tensor_operation::device::TensorSpecialization; @@ -64,21 +66,26 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test ck::Tuple<>, ck::Tuple<>, MaskingType::value>( - verify_, 2, false, bench_, M, N, K, O, G0, G1); + verify_, 2, false, bench_, M, N, K, O, G0, G1, -1, instance_index); EXPECT_TRUE(pass); } void Run() { - for(auto lengths : this->lengths_) + for(size_t i = 0; i < this->lengths_.size(); i++) { - int M = lengths[0]; - int N = lengths[1]; - int K = lengths[2]; - int O = lengths[3]; - int G0 = lengths[4]; - int G1 = lengths[5]; + if((param_mask & (1 << i)) == 0) + { + continue; + } + auto& lengths = this->lengths_[i]; + int M = lengths[0]; + int N = lengths[1]; + int K = lengths[2]; + int O = lengths[3]; + 
diff --git a/test/batchnorm/batchnorm_bwd_rank_4.cpp b/test/batchnorm/batchnorm_bwd_rank_4.cpp
index cc514261e6..66908360d1 100644
--- a/test/batchnorm/batchnorm_bwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_bwd_rank_4.cpp
@@ -15,6 +15,9 @@ using F32 = float;
 using BF16 = ck::bhalf_t;
 using F64  = double;
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestBatchNormBwdRank4 : public ::testing::Test
 {
@@ -37,33 +40,48 @@ class TestBatchNormBwdRank4 : public ::testing::Test
     template <ck::index_t NumReduceDim>
     void Run()
     {
-        for(auto& inOutLengths : list_of_lengths)
+        for(size_t i = 0; i < list_of_lengths.size(); i++)
         {
-            bool pass = true;
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& inOutLengths = list_of_lengths[i];
+            bool pass          = true;
 
             EXPECT_FALSE(reduceDims.size() != NumReduceDim);
 
-            pass = pass && ck::profiler::profile_batchnorm_backward_impl(
-                true, 3, false, false, inOutLengths, reduceDims, true, epsilon);
+            pass =
+                pass &&
+                ck::profiler::profile_batchnorm_backward_impl(
+                    true, 3, false, false, inOutLengths, reduceDims, true, epsilon, instance_index);
 
-            pass = pass && ck::profiler::profile_batchnorm_backward_impl(
-                true, 3, false, false, inOutLengths, reduceDims, false, epsilon);
+            pass =
+                pass && ck::profiler::profile_batchnorm_backward_impl(true,
+                                                                      3,
+                                                                      false,
+                                                                      false,
+                                                                      inOutLengths,
+                                                                      reduceDims,
+                                                                      false,
+                                                                      epsilon,
+                                                                      instance_index);
 
             EXPECT_TRUE(pass);
         }
@@ -103,3 +121,19 @@ TYPED_TEST(TestBatchNormBwdRank4, nchw)
     this->reduceDims = {0, 2, 3};
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/batchnorm/batchnorm_fwd_rank_4.cpp b/test/batchnorm/batchnorm_fwd_rank_4.cpp
index 6bf635f0cd..8d81a3892c 100644
--- a/test/batchnorm/batchnorm_fwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_fwd_rank_4.cpp
@@ -16,6 +16,9 @@ using BF16 = ck::bhalf_t;
 using I8  = int8_t;
 using F64 = double;
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestBatchNormFwdRank4 : public ::testing::Test
 {
@@ -38,9 +41,14 @@ class TestBatchNormFwdRank4 : public ::testing::Test
     template <ck::index_t NumReduceDim>
     void Run()
     {
-        for(auto& inOutLengths : list_of_lengths)
+        for(size_t i = 0; i < list_of_lengths.size(); i++)
         {
-            bool pass = true;
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& inOutLengths = list_of_lengths[i];
+            bool pass          = true;
 
             EXPECT_FALSE(reduceDims.size() != NumReduceDim);
 
@@ -61,7 +69,8 @@ class TestBatchNormFwdRank4 : public ::testing::Test
                                     true,
                                     true,
                                     epsilon,
-                                    averageFactor);
+                                    averageFactor,
+                                    instance_index);
 
             pass = pass && ck::profiler::profile_batchnorm_forward_impl
[...]
     this->reduceDims = {0, 2, 3};
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
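One caveat that applies to every `(param_mask & (1 << i))` loop added in this patch: `1` is a signed int, so the shift is only well-defined for i up to 30, and the default mask of 0xffff only enables the first 16 parameter sets anyway. If a test ever grows past that, a wider check along these lines would be needed; this is a sketch, not part of the patch:

    #include <cstddef>
    #include <cstdint>

    // Safe bit test for parameter selection: unsigned 64-bit shift, with a
    // range guard so indices past the mask width are treated as enabled.
    inline bool param_selected(std::uint64_t mask, std::size_t i)
    {
        return i >= 64 || (mask & (std::uint64_t{1} << i)) != 0;
    }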
diff --git a/test/batchnorm/batchnorm_infer_rank_4.cpp b/test/batchnorm/batchnorm_infer_rank_4.cpp
index 0165192acf..41c9cdb94e 100644
--- a/test/batchnorm/batchnorm_infer_rank_4.cpp
+++ b/test/batchnorm/batchnorm_infer_rank_4.cpp
@@ -10,6 +10,9 @@
 #include "profiler/profile_batchnorm_infer_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 using F16  = ck::half_t;
 using F32  = float;
 using BF16 = ck::bhalf_t;
@@ -36,31 +39,38 @@ class TestBatchNormInferRank4 : public ::testing::Test
     template <ck::index_t NumReduceDim>
     void Run()
     {
-        for(auto& inOutLengths : list_of_lengths)
+        for(size_t i = 0; i < list_of_lengths.size(); i++)
         {
-            bool pass = true;
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& inOutLengths = list_of_lengths[i];
+            bool pass          = true;
 
             EXPECT_FALSE(reduceDims.size() != NumReduceDim);
 
-            pass = pass && ck::profiler::profile_batchnorm_infer_impl(
-                true, 3, false, false, inOutLengths, reduceDims, epsilon);
+            pass = pass &&
+                   ck::profiler::profile_batchnorm_infer_impl(
+                       true, 3, false, false, inOutLengths, reduceDims, epsilon, instance_index);
 
-            pass = pass && ck::profiler::profile_batchnorm_infer_impl(
-                true, 3, false, false, inOutLengths, reduceDims, epsilon);
+            pass = pass &&
+                   ck::profiler::profile_batchnorm_infer_impl(
+                       true, 3, false, false, inOutLengths, reduceDims, epsilon, instance_index);
 
             EXPECT_TRUE(pass);
         }
@@ -100,3 +110,20 @@ TYPED_TEST(TestBatchNormInferRank4, nchw)
     this->reduceDims = {0, 2, 3};
     this->template Run<3>();
 }
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/contraction/test_contraction_xdl.cpp b/test/contraction/test_contraction_xdl.cpp
index 2bfd5a6a66..3a65b57b0e 100644
--- a/test/contraction/test_contraction_xdl.cpp
+++ b/test/contraction/test_contraction_xdl.cpp
@@ -12,10 +12,11 @@
 #include "profiler/profile_contraction_impl.hpp"
 #include "profiler/profile_contraction_utils.hpp"
 
-using F16  = ck::half_t;
-using BF16 = ck::bhalf_t;
-using F32  = float;
-using F64  = double;
+static ck::index_t instance_index = -1;
+using F16                         = ck::half_t;
+using BF16                        = ck::bhalf_t;
+using F32                         = float;
+using F64                         = double;
 
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -95,7 +96,8 @@ class TestContraction : public ::testing::Test
                 StridesA,
                 StridesB,
                 StridesC,
-                StridesD);
+                StridesD,
+                instance_index);
 
             EXPECT_TRUE(pass);
         }
     }
@@ -219,3 +221,18 @@ TYPED_TEST(TestContractionScaleMixedPrecision, scale)
     this->template Run<2>({{8, 16}, {1, 1}, {8, 16}});
     this->template Run<2>({{1, 1}, {1, 1}, {1, 1}});
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 2)
+    {
+        instance_index = atoi(argv[1]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1: instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
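The contraction test takes a single optional argument, parsed with `atoi`. Worth remembering when driving these binaries from scripts: `atoi` reports no errors and simply yields 0 (i.e. instance 0, not "all") on malformed input, whereas `strtol` at least exposes how much of the string was consumed:

    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        const char* s = "abc";
        std::printf("atoi:   %d\n", std::atoi(s)); // 0, indistinguishable from "0"
        char* end = nullptr;
        long v    = std::strtol(s, &end, 10);
        std::printf("strtol: %ld (parsed %td chars)\n", v, end - s); // 0 (parsed 0 chars)
        return 0;
    }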
diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp
index 5cb8731b26..8904b58d8d 100644
--- a/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp
+++ b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp
@@ -11,6 +11,9 @@
 #include "profiler/profile_conv_tensor_rearrange_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestConvTensorRearrange : public ::testing::Test
 {
@@ -25,18 +28,24 @@ class TestConvTensorRearrange : public ::testing::Test
     {
         EXPECT_FALSE(conv_params.empty());
         bool pass = true;
-        for(auto& param : conv_params)
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
-            pass = pass && ck::profiler::profile_conv_tensor_rearrange_impl(
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
+            pass = pass && ck::profiler::profile_conv_tensor_rearrange_impl(
                 true,  // do_verification
                 1,     // init_method: integer value
                 false, // do_log
                 false, // time_kernel
-                param);
+                param,
+                instance_index);
         }
         EXPECT_TRUE(pass);
     }
@@ -157,3 +166,19 @@ TYPED_TEST(TestConvTensorRearrange3d, Test3D)
     this->template Run<3, int8_t, int8_t>();
 #endif
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp b/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp
index 9d2b6cf577..5ad4f63d30 100644
--- a/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp
+++ b/test/convnd_bwd_data/convnd_bwd_data_xdl.cpp
@@ -9,7 +9,8 @@
 #include
 #include "profiler/profile_conv_bwd_data_impl.hpp"
-
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestConvndBwdData : public ::testing::Test
 {
@@ -20,10 +21,15 @@ class TestConvndBwdData : public ::testing::Test
     template <ck::index_t NDimSpatial>
     void Run()
    {
-        for(auto& param : conv_params)
+        EXPECT_FALSE(conv_params.empty());
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
             bool pass;
-            EXPECT_FALSE(conv_params.empty());
 
             pass = ck::profiler::profile_conv_bwd_data_impl<
                 NDimSpatial,
                 ck::tuple_element_t
[...]
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/convnd_fwd/convnd_fwd_xdl.cpp b/test/convnd_fwd/convnd_fwd_xdl.cpp
index fe8798ceb8..6d507211ce 100644
--- a/test/convnd_fwd/convnd_fwd_xdl.cpp
+++ b/test/convnd_fwd/convnd_fwd_xdl.cpp
@@ -10,6 +10,8 @@
 #include "profiler/profile_conv_fwd_impl.hpp"
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
 template <typename Tuple>
 class TestConvndFwd : public ::testing::Test
 {
@@ -20,10 +22,15 @@ class TestConvndFwd : public ::testing::Test
     template <ck::index_t NDimSpatial>
     void Run()
     {
-        for(auto& param : conv_params)
+        EXPECT_FALSE(conv_params.empty());
+        for(size_t i = 0; i < conv_params.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& param = conv_params[i];
             bool pass;
-            EXPECT_FALSE(conv_params.empty());
 
             pass = ck::profiler::profile_conv_fwd_impl<
                 NDimSpatial,
                 ck::tuple_element_t
[...]
     this->template Run<3>();
 }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
index d5ce77dc2b..43192ed139 100644
--- a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
+++ b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
@@ -8,6 +8,9 @@ using F16 = ck::half_t;
 using F32 = float;
 using ck::index_t;
 
+static ck::index_t param_mask     = 0xffff;
+static ck::index_t instance_index = -1;
+
 template <typename Tuple>
 class TestElementwiseLayernorm : public ::testing::Test
 {
@@ -25,15 +28,20 @@ class TestElementwiseLayernorm : public ::testing::Test
         std::vector<std::vector<index_t>> lengths = {
             {1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}, {4096, 8192}};
 
-        for(auto length : lengths)
+        for(size_t i = 0; i < lengths.size(); i++)
         {
+            if((param_mask & (1 << i)) == 0)
+            {
+                continue;
+            }
+            auto& length = lengths[i];
             bool success = ck::profiler::profile_elementwise_layernorm_impl(
-                true, 2, false, false, length);
+                true, 2, false, false, length, instance_index);
 
             EXPECT_TRUE(success);
         }
     }
@@ -45,3 +53,19 @@ using KernelTypes = ::testing::Types<
 
 TYPED_TEST_SUITE(TestElementwiseLayernorm, KernelTypes);
 
 TYPED_TEST(TestElementwiseLayernorm, Test_FP16) { this->Run(); }
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    if(argc == 1) {}
+    else if(argc == 3)
+    {
+        param_mask     = strtol(argv[1], nullptr, 0);
+        instance_index = atoi(argv[2]);
+    }
+    else
+    {
+        std::cout << "Usage of " << argv[0] << std::endl;
+        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
+    }
+    return RUN_ALL_TESTS();
+}
diff --git a/test/gemm/gemm_bf16.cpp b/test/gemm/gemm_bf16.cpp
index cde5c45aea..d06735a097 100644
--- a/test/gemm/gemm_bf16.cpp
+++ b/test/gemm/gemm_bf16.cpp
@@ -31,4 +31,4 @@ using AccDataType = float;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_fp16.cpp b/test/gemm/gemm_fp16.cpp
index cad250c6fb..185412ab65 100644
--- a/test/gemm/gemm_fp16.cpp
+++ b/test/gemm/gemm_fp16.cpp
@@ -31,4 +31,4 @@ using AccDataType = float;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp
index c35aa77ea7..cf2d0bd01d 100644
--- a/test/gemm/gemm_fp32.cpp
+++ b/test/gemm/gemm_fp32.cpp
@@ -31,4 +31,4 @@ using AccDataType = float;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp
index e67c8ba4f3..7bf89d9c20 100644
--- a/test/gemm/gemm_fp64.cpp
+++ b/test/gemm/gemm_fp64.cpp
@@ -31,4 +31,4 @@ using AccDataType = double;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp
index 6ece05e306..f1a19dd61a 100644
--- a/test/gemm/gemm_int8.cpp
+++ b/test/gemm/gemm_int8.cpp
@@ -31,4 +31,4 @@ using AccDataType = int32_t;
 
 #include "run_gemm_test.inc"
 
-int main() { return run_gemm_test(); }
+int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }
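The five gemm_*.cpp changes are identical one-liners that thread the command line into `run_gemm_test()`. That only compiles if the shared `run_gemm_test.inc` was updated to the two-argument signature; that hunk is not shown in this excerpt, so the stub below is an assumption about its shape, not the actual file:

    // Hypothetical stand-in for run_gemm_test.inc's entry point after this
    // patch: same name, now accepting the command line for index parsing.
    static int run_gemm_test(int argc, char* argv[])
    {
        (void)argc; // presumably parses an instance index, as elsewhere
        (void)argv;
        return 0;
    }

    int main(int argc, char* argv[]) { return run_gemm_test(argc, argv); }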
diff --git a/test/gemm/gemm_standalone_xdl_fp16.cpp b/test/gemm/gemm_standalone_xdl_fp16.cpp
index 201a49dcd3..bee2d1ec80 100644
--- a/test/gemm/gemm_standalone_xdl_fp16.cpp
+++ b/test/gemm/gemm_standalone_xdl_fp16.cpp
@@ -105,6 +105,7 @@ int main(int argc, char* argv[])
 
     bool do_verification = true;
     bool time_kernel     = true;
+    int problem_index    = -1;
 
     if(argc == 1)
     {
@@ -115,16 +116,28 @@ int main(int argc, char* argv[])
         do_verification = std::stoi(argv[1]);
         time_kernel     = std::stoi(argv[2]);
     }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+        problem_index   = std::stoi(argv[3]);
+    }
     else
     {
         std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
-                  << "arg2: time kernel (0=no, 1=yes)" << std::endl;
+                  << "arg2: time kernel (0=no, 1=yes)" << std::endl
+                  << "arg3: problem index (0-35, -1 means all)" << std::endl;
         return 0;
     }
 
     bool pass = true;
-    for(auto& p : problems)
+    for(size_t i = 0; i < problems.size(); i++)
     {
+        if(problem_index != -1 && problem_index != static_cast<int>(i))
+        {
+            continue;
+        }
+        auto& p = problems[i];
         GemmParams& problem_size          = std::get<0>(p);
         const LayoutConfig& layout_config = std::get<1>(p);
         const auto& factory               = std::get<2>(p);
diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp
index 6c46f4ee89..043eca0e83 100644
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -261,6 +261,44 @@ struct TestGemm
             return true;
         }
     }
+
+    template