From 04e4ac53bdea7e7352a109d142436096e14807f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Mon, 19 Aug 2024 17:24:56 +0200 Subject: [PATCH] Add script to convert MIOpen driver to ckProfiler (#1472) * Add script to convert MIOpen driver to ckProfiler * Fix [ROCm/composable_kernel commit: a6a796650587d845e28c2c2e99535aa97065d36b] --- .../profile_grouped_conv_bwd_weight_impl.hpp | 169 ++++---- .../src/profile_grouped_conv_bwd_weight.cpp | 3 +- script/convert_miopen_driver_to_profiler.py | 386 ++++++++++++++++++ 3 files changed, 480 insertions(+), 78 deletions(-) create mode 100644 script/convert_miopen_driver_to_profiler.py diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index 356aec7a08..5318de5e8b 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -136,9 +136,10 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, std::cout << "found " << op_ptrs.size() << " instances" << std::endl; std::string best_op_name; - float best_avg_time = 0; - float best_tflops = 0; - float best_gb_per_sec = 0; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + ck::index_t best_split_k = 1; // profile device Conv instances bool all_pass = true; @@ -167,99 +168,115 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, range_copy(conv_param.input_left_pads_, begin(input_left_pads)); range_copy(conv_param.input_right_pads_, begin(input_right_pads)); + std::vector split_k_list = {1, 2, 4, 8, 16, 32, 64, 128}; + + if(split_k > 0) + { + split_k_list = {split_k}; + } + for(auto& op_ptr : op_ptrs) { - auto argument_ptr = - op_ptr->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - input_lengths, - input_strides, - filter_lengths, - weights_strides, - output_lengths, - output_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - in_element_op, - wei_element_op, - out_element_op, - split_k); - - const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); - DeviceMem workspace_dev(workspace_sz); - op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); - - if(op_ptr->IsSupportedArgument(argument_ptr.get())) + for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++) { - // using atomic add, so need to reset input - wei_device_buf.SetZero(); + auto argument_ptr = op_ptr->MakeArgumentPointer( + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + input_lengths, + input_strides, + filter_lengths, + weights_strides, + output_lengths, + output_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op, + split_k_list[split_k_id]); - std::string op_name = op_ptr->GetTypeString(); + const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + DeviceMem workspace_dev(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); - auto invoker_ptr = op_ptr->MakeInvokerPointer(); - - float avg_time = - invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); - - std::size_t flop = conv_param.GetFlops(); - std::size_t num_btype = conv_param.GetByte(); - - float tflops = static_cast(flop) / 1.E9 / avg_time; - float gb_per_sec = num_btype / 1.E6 / avg_time; - - std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " - << gb_per_sec << " GB/s, " << op_name << std::endl; - - if(tflops > best_tflops) + if(op_ptr->IsSupportedArgument(argument_ptr.get())) { - best_op_name = op_name; - best_tflops = tflops; - best_avg_time = avg_time; - best_gb_per_sec = gb_per_sec; - } + // using atomic add, so need to reset input + wei_device_buf.SetZero(); - if(do_verification) - { - wei_device_buf.FromDevice(weight_device_result.mData.data()); + std::string op_name = op_ptr->GetTypeString(); - bool pass = ck::utils::check_err(weight_device_result, weight_host_result); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); - if(!pass) + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK " + << split_k_list[split_k_id] << std::endl; + + if(tflops > best_tflops) { - std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl; + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_split_k = split_k_list[split_k_id]; } - all_pass &= pass; - - if(do_log) + if(do_verification) { - LogRangeAsType(std::cout << "output : ", output.mData, ",") << std::endl; - ; - LogRangeAsType( - std::cout << "weight (device): ", weight_device_result.mData, ",") - << std::endl; - ; - LogRangeAsType( - std::cout << "weight (host): ", weight_host_result.mData, ",") - << std::endl; - ; - LogRangeAsType(std::cout << "input: ", input.mData, ",") << std::endl; - ; + wei_device_buf.FromDevice(weight_device_result.mData.data()); + + bool pass = ck::utils::check_err(weight_device_result, weight_host_result); + + if(!pass) + { + std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl; + } + + all_pass &= pass; + + if(do_log) + { + LogRangeAsType(std::cout << "output : ", output.mData, ",") + << std::endl; + ; + LogRangeAsType( + std::cout << "weight (device): ", weight_device_result.mData, ",") + << std::endl; + ; + LogRangeAsType( + std::cout << "weight (host): ", weight_host_result.mData, ",") + << std::endl; + ; + LogRangeAsType(std::cout << "input: ", input.mData, ",") + << std::endl; + ; + } } } - } - else - { - std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" + << std::endl; + } } } std::cout << "Best configuration parameters:" << "\nname: " << best_op_name << "\navg_time: " << best_avg_time - << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << ", SplitK " + << best_split_k << std::endl; return all_pass; } diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp index 6ed7cf5e48..7dd75a5e0a 100644 --- a/profiler/src/profile_grouped_conv_bwd_weight.cpp +++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -81,7 +81,6 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv); ck::index_t split_k = std::stoi(argv[8 + 1 + 4 + 6 * num_dim_spatial]); - split_k = std::max(1, split_k); using F32 = float; using F16 = ck::half_t; diff --git a/script/convert_miopen_driver_to_profiler.py b/script/convert_miopen_driver_to_profiler.py new file mode 100644 index 0000000000..47135f3401 --- /dev/null +++ b/script/convert_miopen_driver_to_profiler.py @@ -0,0 +1,386 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +# Convert miopen driver command to ck Profiler +# Example: python3 ../script/convert_miopen_driver_to_profiler.py +# /opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3 +# -p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 32 -F 1 -t 1 + +import argparse +import subprocess + + +def init_const_args(args): + args.ck_profiler_cmd = '../build/bin/ckProfiler' + # use decimal values + args.init_method = 2 + # don't print tensor values + args.log_value = 0 + + +def run_ck_profiler_cmd(cmd): + print("ckProfiler command:") + print(cmd) + subprocess.run(cmd) + + +def parse_data_type(args): + if args.data_type == "fp32": + if args.ck_profier_op == "grouped_conv_bwd_weight" or \ + args.ck_profier_op == "grouped_conv_bwd_weight" or \ + args.ck_profier_op == "grouped_conv_fwd": + args.data_type = 0 + if args.data_type == "fp16": + if args.ck_profier_op == "grouped_conv_bwd_weight" or \ + args.ck_profier_op == "grouped_conv_bwd_data" or \ + args.ck_profier_op == "grouped_conv_fwd": + args.data_type = 1 + if args.data_type == "int8": + if args.ck_profier_op == "grouped_conv_bwd_weight": + args.data_type = 4 + if args.ck_profier_op == "grouped_conv_bwd_data": + print('Not supported data type for grouped_conv_bwd_data') + exit(1) + if args.ck_profier_op == "grouped_conv_fwd": + args.data_type = 3 + if args.data_type == "bfp16": + if args.ck_profier_op == "grouped_conv_bwd_weight" or \ + args.ck_profier_op == "grouped_conv_bwd_data" or \ + args.ck_profier_op == "grouped_conv_fwd": + args.data_type = 2 + + +def add_conv_params_to_cmd(args, cmd): + if args.spatial_dim == 1: + cmd += [str(args.fil_w), str(args.in_w)] + cmd += [str(args.conv_stride_w), str(args.dilation_w)] + cmd += [str(args.pad_w), str(args.pad_w)] + elif args.spatial_dim == 2: + cmd += [str(args.fil_h), str(args.fil_w)] + cmd += [str(args.in_h), str(args.in_w)] + cmd += [str(args.conv_stride_h), str(args.conv_stride_w)] + cmd += [str(args.dilation_h), str(args.dilation_w)] + cmd += [str(args.pad_h), str(args.pad_w)] + cmd += [str(args.pad_h), str(args.pad_w)] + elif args.spatial_dim == 3: + cmd += [str(args.fil_d), str(args.fil_h), str(args.fil_w)] + cmd += [str(args.in_d), str(args.in_h), str(args.in_w)] + cmd += [str(args.conv_stride_d), str(args.conv_stride_h)] + cmd += [str(args.conv_stride_w)] + cmd += [str(args.dilation_d), + str(args.dilation_h), + str(args.dilation_w)] + cmd += [str(args.pad_d), str(args.pad_h), str(args.pad_w)] + cmd += [str(args.pad_d), str(args.pad_h), str(args.pad_w)] + else: + print('Not supported spatial dim (supported: 1, 2, 3)') + exit(1) + + +def run_ck_grouped_conv_fwd(args): + args.ck_profier_op = "grouped_conv_fwd" + parse_data_type(args) + # default for MIOpen NHWGC + args.layout = 1 + # use int32 by default + args.index_type = 0 + + cmd = [str(args.ck_profiler_cmd), str(args.ck_profier_op)] + cmd += [str(args.data_type), str(args.layout), str(args.index_type)] + cmd += [str(args.verify), str(args.init_method)] + cmd += [str(args.log_value), str(args.time)] + cmd += [str(args.spatial_dim), str(args.group_count)] + cmd += [str(args.batchsize), str(args.out_channels)] + cmd += [str(args.in_channels)] + add_conv_params_to_cmd(args, cmd) + + run_ck_profiler_cmd(cmd) + + +def run_ck_grouped_conv_bwd_data(args): + args.ck_profier_op = "grouped_conv_bwd_data" + parse_data_type(args) + # default for MIOpen NHWGC + args.layout = 1 + + cmd = [str(args.ck_profiler_cmd), str(args.ck_profier_op)] + cmd += [str(args.data_type), str(args.layout)] + cmd += [str(args.verify), str(args.init_method)] + cmd += [str(args.log_value), str(args.time)] + cmd += [str(args.spatial_dim), str(args.group_count)] + cmd += [str(args.batchsize), str(args.out_channels)] + cmd += [str(args.in_channels)] + add_conv_params_to_cmd(args, cmd) + + run_ck_profiler_cmd(cmd) + + +def run_ck_grouped_conv_bwd_weight(args): + args.ck_profier_op = "grouped_conv_bwd_weight" + parse_data_type(args) + # default for MIOpen NHWGC + args.layout = 2 + # Test all split K value from the list {1, 2, 4, 8, 32, 64, 128} + args.split_k_value = -1 + + cmd = [str(args.ck_profiler_cmd), str(args.ck_profier_op)] + cmd += [str(args.data_type), str(args.layout)] + cmd += [str(args.verify), str(args.init_method)] + cmd += [str(args.log_value), str(args.time)] + cmd += [str(args.spatial_dim), str(args.group_count)] + cmd += [str(args.batchsize), str(args.out_channels)] + cmd += [str(args.in_channels)] + add_conv_params_to_cmd(args, cmd) + + cmd += [str(args.split_k_value)] + run_ck_profiler_cmd(cmd) + +# Get name of miopen driver, remove it from unknown +def process_miopen_driver_name(args, unknown): + if "convint8" in unknown: + args.data_type = 'int8' + unknown.remove("convint8") + elif "convbfp16" in unknown: + args.data_type = 'bfp16' + unknown.remove("convbfp16") + elif "convfp16" in unknown: + args.data_type = 'fp16' + unknown.remove("convfp16") + elif "conv" in unknown: + args.data_type = 'fp32' + unknown.remove("conv") + else: + print('Not supported driver (supported: conv, convfp16, convint8,' + ' convbfp16).') + exit(1) + + +def run_ck_profiler(args): + # MIOpen get number of channel per all groups, CK profiler get number of + # channel per group + args.in_channels = int(args.in_channels / args.group_count) + args.out_channels = int(args.out_channels / args.group_count) + + if args.forw == 0 or args.forw == 1 or args.forw == 3 or args.forw == 5: + run_ck_grouped_conv_fwd(args) + if args.forw == 0 or args.forw == 2 or args.forw == 3 or args.forw == 6: + run_ck_grouped_conv_bwd_data(args) + if args.forw == 0 or args.forw == 4 or args.forw == 5 or args.forw == 6: + run_ck_grouped_conv_bwd_weight(args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="converter", + description="Convert miopen driver command to ck Profiler" + "\nExample: python3 " + "../script/convert_miopen_driver_to_profiler.py " + "/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 " + "-k 64 -y 3 -x 3 -p 1 -q 1 -u 1 -v 1 -l 1 -j 1 -m conv -g " + "32 -F 1 -t 1", + ) + parser.add_argument( + "-in_layout", + "-I", + default=-1, + type=int, + required=False, + help="Input Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)" + ) + parser.add_argument( + "-forw", + "-F", + default=0, + type=int, + required=False, + help="Flag enables fwd, bwd, wrw convolutions" + "\n0 fwd+bwd+wrw (default)" + "\n1 fwd only" + "\n2 bwd only" + "\n4 wrw only" + "\n3 fwd+bwd" + "\n5 fwd+wrw" + "\n6 bwd+wrw" + ) + parser.add_argument( + "-spatial_dim", + "-_", + default=2, + type=int, + required=False, + help="convolution spatial dimension (Default-2)" + ) + parser.add_argument( + "-batchsize", + "-n", + default=100, + type=int, + required=False, + help="Mini-batch size (Default=100)" + ) + parser.add_argument( + "-in_channels", + "-c", + default=3, + type=int, + required=False, + help="Number of Input Channels (Default=3)" + ) + parser.add_argument( + "-in_d", + "-!", + default=32, + type=int, + required=False, + help="Input Depth (Default=32)" + ) + parser.add_argument( + "-in_h", + "-H", + default=32, + type=int, + required=False, + help="Input Height (Default=32)" + ) + parser.add_argument( + "-in_w", + "-W", + default=32, + type=int, + required=False, + help="Input Width (Default=32)" + ) + parser.add_argument( + "-out_channels", + "-k", + default=32, + type=int, + required=False, + help="Number of Output Channels (Default=32)" + ) + parser.add_argument( + "-fil_d", + "-@", + default=3, + type=int, + required=False, + help="Filter Depth (Default=3)" + ) + parser.add_argument( + "-fil_h", + "-y", + default=3, + type=int, + required=False, + help="Filter Height (Default=3)" + ) + parser.add_argument( + "-fil_w", + "-x", + default=3, + type=int, + required=False, + help="Filter Width (Default=3)" + ) + parser.add_argument( + "-conv_stride_d", + "-#", + default=1, + type=int, + required=False, + help="Convolution Stride for Depth (Default=1)" + ) + parser.add_argument( + "-conv_stride_h", + "-u", + default=1, + type=int, + required=False, + help="Convolution Stride for Height (Default=1)" + ) + parser.add_argument( + "-conv_stride_w", + "-v", + default=1, + type=int, + required=False, + help="Convolution Stride for Width (Default=1)" + ) + parser.add_argument( + "-pad_d", + "-$", + default=1, + type=int, + required=False, + help="Zero Padding for Depth (Default=0)" + ) + parser.add_argument( + "-pad_h", + "-p", + default=1, + type=int, + required=False, + help="Zero Padding for Height (Default=0)" + ) + parser.add_argument( + "-pad_w", + "-q", + default=1, + type=int, + required=False, + help="Zero Padding for Width (Default=0)" + ) + parser.add_argument( + "-verify", + "-V", + default=1, + type=int, + required=False, + help="Verify Each Layer (Default=1)" + ) + parser.add_argument( + "-time", + "-t", + default=0, + type=int, + required=False, + help="Time Each Layer (Default=0)" + ) + parser.add_argument( + "-dilation_d", + "-^", + default=1, + type=int, + required=False, + help="Dilation of Filter Depth (Default=1)" + ) + parser.add_argument( + "-dilation_h", + "-l", + default=1, + type=int, + required=False, + help="Dilation of Filter Height (Default=1)" + ) + parser.add_argument( + "-dilation_w", + "-j", + default=1, + type=int, + required=False, + help="Dilation of Filter Width (Default=1)" + ) + parser.add_argument( + "-group_count", + "-g", + type=int, + default=1, + required=False, + help="Number of Groups (Default=1)" + ) + + args, unknown = parser.parse_known_args() + init_const_args(args) + process_miopen_driver_name(args, unknown) + print("Ignored args:") + print(unknown) + run_ck_profiler(args)