From a56a2b20cd99274dbb5d849afd99252cd761813c Mon Sep 17 00:00:00 2001 From: Qun Lin Date: Tue, 27 May 2025 11:15:55 +0800 Subject: [PATCH] [CK] Add test script for all ckProfiler ops 1. Add test scripts for all ckProfiler ops 2. Extend script run_full_performance_tests.sh to include all ops. --- script/profile_avg_pool2d_bwd.sh | 15 + script/profile_avg_pool3d_bwd.sh | 15 + script/profile_batched_gemm_b_scale.sh | 20 + script/profile_batched_gemm_gemm.sh | 17 + script/profile_batched_gemm_reduce.sh | 37 ++ script/profile_bnorm.sh | 13 + script/profile_bnorm_fwd.sh | 14 + script/profile_contraction_bilinear.sh | 15 + script/profile_contraction_scale.sh | 15 + script/profile_conv.sh | 34 ++ script/profile_conv_fwd_bias_relu_add.sh | 19 + script/profile_conv_tensor_rearrange.sh | 21 + script/profile_gemm_b_scale.sh | 18 + script/profile_gemm_d0_d1_e.sh | 19 + script/profile_gemm_d0_e.sh | 19 + script/profile_gemm_universal_batched.sh | 36 ++ script/profile_gemm_universal_streamk.sh | 40 ++ script/profile_grouped_conv_bwd_data.sh | 42 +- script/profile_grouped_gemm_fixed_nk.sh | 17 + script/profile_groupnorm.sh | 15 + script/profile_layernorm.sh | 15 + script/profile_max_pool2d_bwd.sh | 14 + script/profile_max_pool2d_fwd.sh | 16 + script/profile_max_pool3d.sh | 14 + script/profile_pool3d_fwd.sh | 17 + script/profile_softmax.sh | 18 + script/profile_transpose.sh | 13 + script/run_full_performance_tests.sh | 496 ++++++++++++++++++++++- 28 files changed, 1013 insertions(+), 31 deletions(-) create mode 100755 script/profile_avg_pool2d_bwd.sh create mode 100755 script/profile_avg_pool3d_bwd.sh create mode 100755 script/profile_batched_gemm_b_scale.sh create mode 100755 script/profile_batched_gemm_gemm.sh create mode 100755 script/profile_batched_gemm_reduce.sh create mode 100755 script/profile_bnorm.sh create mode 100755 script/profile_bnorm_fwd.sh create mode 100755 script/profile_contraction_bilinear.sh create mode 100755 script/profile_contraction_scale.sh create mode 
100755 script/profile_conv.sh create mode 100755 script/profile_conv_fwd_bias_relu_add.sh create mode 100755 script/profile_conv_tensor_rearrange.sh create mode 100755 script/profile_gemm_b_scale.sh create mode 100755 script/profile_gemm_d0_d1_e.sh create mode 100755 script/profile_gemm_d0_e.sh create mode 100755 script/profile_gemm_universal_batched.sh create mode 100755 script/profile_gemm_universal_streamk.sh create mode 100755 script/profile_grouped_gemm_fixed_nk.sh create mode 100755 script/profile_groupnorm.sh create mode 100755 script/profile_layernorm.sh create mode 100755 script/profile_max_pool2d_bwd.sh create mode 100755 script/profile_max_pool2d_fwd.sh create mode 100755 script/profile_max_pool3d.sh create mode 100755 script/profile_pool3d_fwd.sh create mode 100755 script/profile_softmax.sh create mode 100755 script/profile_transpose.sh diff --git a/script/profile_avg_pool2d_bwd.sh b/script/profile_avg_pool2d_bwd.sh new file mode 100755 index 0000000000..5272b38314 --- /dev/null +++ b/script/profile_avg_pool2d_bwd.sh @@ -0,0 +1,15 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY=$3 +INIT=$4 +LOG=$5 +TIME=$6 + +######## op datatype verify init log time length(NCHW) window size(YX) stride dilation left pad right pad +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 2 32 30 30 --wsize 2 2 --wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1 --dmmy 28 29 30 31 32 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 64 256 64 64 --wsize 2 2 --wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1 --dmmy 28 29 30 31 32 + diff --git a/script/profile_avg_pool3d_bwd.sh b/script/profile_avg_pool3d_bwd.sh new file mode 100755 index 0000000000..062d00f184 --- /dev/null +++ b/script/profile_avg_pool3d_bwd.sh @@ -0,0 +1,15 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY=$3 +INIT=$4 +LOG=$5 +TIME=$6 + 
+######## op datatype verify init log time length(NCDHW) window size(YX) stride dilation left pad right pad +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 2 32 30 30 30 --wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 64 64 64 64 64 --wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1 + diff --git a/script/profile_batched_gemm_b_scale.sh b/script/profile_batched_gemm_b_scale.sh new file mode 100755 index 0000000000..2eae5fa030 --- /dev/null +++ b/script/profile_batched_gemm_b_scale.sh @@ -0,0 +1,20 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" + +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +######## op datatype layout B_block_tile verify init log time M___ N___ K___ StrideA StrideB StrideC BatchCount KSplit +$DRIVER $OP $DATATYPE $LAYOUT 1 $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 8 1 +$DRIVER $OP $DATATYPE $LAYOUT 1 $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 -1 8 1 +$DRIVER $OP $DATATYPE $LAYOUT 1 $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 -1 4 1 +$DRIVER $OP $DATATYPE $LAYOUT 1 $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 -1 2 1 + +##todo: B_block_tile is incorrect profile_batched_gemm_b_scale diff --git a/script/profile_batched_gemm_gemm.sh b/script/profile_batched_gemm_gemm.sh new file mode 100755 index 0000000000..3df2b947d0 --- /dev/null +++ b/script/profile_batched_gemm_gemm.sh @@ -0,0 +1,17 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +######## op datatype layout verify init log time M___ N___ K___ O__ BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 1024 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 2048 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 
4096 4096 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 8192 2 \ No newline at end of file diff --git a/script/profile_batched_gemm_reduce.sh b/script/profile_batched_gemm_reduce.sh new file mode 100755 index 0000000000..ce78db1f8c --- /dev/null +++ b/script/profile_batched_gemm_reduce.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" + +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 -1 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 -1 2 + + ####### op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1024 1024 1024 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2048 2048 2048 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4096 4096 4096 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8192 8192 8192 2 + + ####### op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1056 1056 1056 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2080 2080 2080 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4128 4128 4128 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8224 8224 8224 2 + + ####### op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT 
$LOG $TIME 1024 1024 1024 1088 1088 1088 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2112 2112 2112 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4160 4160 4160 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8256 8256 8256 2 diff --git a/script/profile_bnorm.sh b/script/profile_bnorm.sh new file mode 100755 index 0000000000..09a261ccb6 --- /dev/null +++ b/script/profile_bnorm.sh @@ -0,0 +1,13 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY="-v $3" +INIT=$4 +TIME=$5 +USE=$6 + +######## op datatype UseSavedMean init time inOutLengths(nhwc) reduceDims verify + $DRIVER $OP $DATATYPE $USE $INIT $TIME -D 64,64,280,82 -R 1,2,3 $VERIFY \ No newline at end of file diff --git a/script/profile_bnorm_fwd.sh b/script/profile_bnorm_fwd.sh new file mode 100755 index 0000000000..008a137bfc --- /dev/null +++ b/script/profile_bnorm_fwd.sh @@ -0,0 +1,14 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY="-v $3" +INIT=$4 +TIME=$5 +USE=$6 + +######## op datatype updateMovingAverage saveMeanAndInvVariance init time inOutLengths(nhwc) reduceDims verify +$DRIVER $OP $DATATYPE $USE 0 $INIT $TIME -D 64,64,280,82 -R 1,2,3 $VERIFY +$DRIVER $OP $DATATYPE $USE 1 $INIT $TIME -D 64,64,280,82 -R 1,2,3 $VERIFY \ No newline at end of file diff --git a/script/profile_contraction_bilinear.sh b/script/profile_contraction_bilinear.sh new file mode 100755 index 0000000000..40bd5211c9 --- /dev/null +++ b/script/profile_contraction_bilinear.sh @@ -0,0 +1,15 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + + +######## op datatype compute_datatype num_dim layout verify init log time alpha beta M0 M1 N0 N1 K0 K1 + $DRIVER $OP $DATATYPE $DATATYPE 2 $LAYOUT 
$VERIFY $INIT $LOG $TIME 1.0 1.0 128 128 128 128 128 128 diff --git a/script/profile_contraction_scale.sh b/script/profile_contraction_scale.sh new file mode 100755 index 0000000000..27a8c0825c --- /dev/null +++ b/script/profile_contraction_scale.sh @@ -0,0 +1,15 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +######## op datatype compute_datatype num_dim layout verify init log time alpha M0 M1 N0 N1 K0 K1 + $DRIVER $OP $DATATYPE $DATATYPE 2 $LAYOUT $VERIFY $INIT $LOG $TIME 1.0 128 128 128 128 128 128 + diff --git a/script/profile_conv.sh b/script/profile_conv.sh new file mode 100755 index 0000000000..bc1afee723 --- /dev/null +++ b/script/profile_conv.sh @@ -0,0 +1,34 @@ +#!/bin/bash +## GPU visibility +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +N=$8 + +######## op datatype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY 
$INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 diff --git a/script/profile_conv_fwd_bias_relu_add.sh b/script/profile_conv_fwd_bias_relu_add.sh new file mode 100755 index 0000000000..9c3ebcfffc --- /dev/null +++ b/script/profile_conv_fwd_bias_relu_add.sh @@ -0,0 +1,19 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +N=$8 +######## op datatype layout layout layout verify init log time N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads + $DRIVER $OP $DATATYPE $LAYOUT $LAYOUT $LAYOUT $VERIFY $INIT $LOG $TIME $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $LAYOUT $LAYOUT $VERIFY $INIT $LOG $TIME $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $LAYOUT $LAYOUT $VERIFY $INIT $LOG $TIME $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $LAYOUT $LAYOUT $VERIFY $INIT $LOG $TIME $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 + 
$DRIVER $OP $DATATYPE $LAYOUT $LAYOUT $LAYOUT $VERIFY $INIT $LOG $TIME $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 diff --git a/script/profile_conv_tensor_rearrange.sh b/script/profile_conv_tensor_rearrange.sh new file mode 100755 index 0000000000..db4a0d9345 --- /dev/null +++ b/script/profile_conv_tensor_rearrange.sh @@ -0,0 +1,21 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +OPTYPE=$8 +N=$9 +######## op datatype layout verify init log time op_type Dim G N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME $OPTYPE 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME $OPTYPE 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME $OPTYPE 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME $OPTYPE 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME $OPTYPE 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 + diff --git a/script/profile_gemm_b_scale.sh b/script/profile_gemm_b_scale.sh new file mode 100755 index 0000000000..f99bd05b80 --- /dev/null +++ b/script/profile_gemm_b_scale.sh @@ -0,0 +1,18 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 +KBatch=$8 + +######## op datatype layout B_block_tile verify init log time M___ N___ K___ StrideA StrideB StrideC KBatch + $DRIVER $OP $DATATYPE $LAYOUT 1 $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT 1 $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 -1 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT 1 $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 -1 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT 1 $VERIFY $INIT $LOG $TIME 7680 8192 
8192 -1 -1 -1 $KBatch diff --git a/script/profile_gemm_d0_d1_e.sh b/script/profile_gemm_d0_d1_e.sh new file mode 100755 index 0000000000..ee2fdff763 --- /dev/null +++ b/script/profile_gemm_d0_d1_e.sh @@ -0,0 +1,19 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD0 StrideD1 StrideE + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1280 1408 1024 -1 -1 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1280 2816 2048 -1 -1 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2560 1408 2048 -1 -1 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2560 2816 2048 -1 -1 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 5120 5632 4096 -1 -1 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7040 8192 8192 -1 -1 -1 -1 -1 \ No newline at end of file diff --git a/script/profile_gemm_d0_e.sh b/script/profile_gemm_d0_e.sh new file mode 100755 index 0000000000..68f3ceb55a --- /dev/null +++ b/script/profile_gemm_d0_e.sh @@ -0,0 +1,19 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD0 StrideE + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1280 1408 1024 -1 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1280 2816 2048 -1 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2560 1408 2048 -1 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2560 2816 2048 -1 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 5120 5632 4096 -1 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7040 8192 8192 -1 -1 -1 -1 diff 
--git a/script/profile_gemm_universal_batched.sh b/script/profile_gemm_universal_batched.sh new file mode 100755 index 0000000000..59407b890c --- /dev/null +++ b/script/profile_gemm_universal_batched.sh @@ -0,0 +1,36 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 +KBatch=$8 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 -1 -1 -1 8 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 -1 -1 -1 -1 8 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 -1 -1 -1 -1 4 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 -1 -1 -1 -1 2 $KBatch + + ####### op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1024 1024 1024 -1 -1 -1 8 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2048 2048 2048 -1 -1 -1 8 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4096 4096 4096 -1 -1 -1 4 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8192 8192 8192 -1 -1 -1 2 $KBatch + + ####### op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1056 1056 1056 -1 -1 -1 8 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2080 2080 2080 -1 -1 -1 8 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4128 4128 4128 -1 -1 -1 4 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT 
$VERIFY $INIT $LOG $TIME 8192 8192 8192 8224 8224 8224 -1 -1 -1 2 $KBatch + + ####### op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1088 1088 1088 -1 -1 -1 8 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2112 2112 2112 -1 -1 -1 8 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4160 4160 4160 -1 -1 -1 4 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8256 8256 8256 -1 -1 -1 2 $KBatch diff --git a/script/profile_gemm_universal_streamk.sh b/script/profile_gemm_universal_streamk.sh new file mode 100755 index 0000000000..2a49bc2166 --- /dev/null +++ b/script/profile_gemm_universal_streamk.sh @@ -0,0 +1,40 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 +STRATEGY=$8 + + +# 120 CU +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC STRATEGY_ GridSize + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 $STRATEGY -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 2048 2048 -1 -1 -1 $STRATEGY -1 + +# 104 CU +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC STRATEGY_ + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 832 1024 1024 -1 -1 -1 $STRATEGY -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 832 2048 2048 -1 -1 -1 $STRATEGY -1 + +# 110 CU +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC STRATEGY_ + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1280 1408 1024 -1 -1 -1 $STRATEGY -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1280 2816 2048 -1 -1 -1 $STRATEGY -1 + +# testing different strides +######## op datatype 
layout verify init log time M___ N___ K___ StrideA StrideB StrideC STRATEGY_ + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1024 1024 1024 $STRATEGY -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2048 2048 2048 $STRATEGY -1 + + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1056 1056 1056 $STRATEGY -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2080 2080 2080 $STRATEGY -1 + + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1088 1088 1088 $STRATEGY -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2112 2112 2112 $STRATEGY -1 + diff --git a/script/profile_grouped_conv_bwd_data.sh b/script/profile_grouped_conv_bwd_data.sh index a1d2f450c9..ac9f1af348 100755 --- a/script/profile_grouped_conv_bwd_data.sh +++ b/script/profile_grouped_conv_bwd_data.sh @@ -15,24 +15,24 @@ TIME=$7 N=$8 # Resnet50 -######## op datatype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 - $DRIVER $OP $DATATYPE $LAYOUT 
$VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 - $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 +######## op datatype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads KSplit + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE 
$LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 1 diff --git a/script/profile_grouped_gemm_fixed_nk.sh b/script/profile_grouped_gemm_fixed_nk.sh new file mode 100755 index 0000000000..d67bc57d34 --- /dev/null +++ b/script/profile_grouped_gemm_fixed_nk.sh @@ -0,0 +1,17 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +######## op datatype layout verify init log time Ms______________ Ns______________ Ks_____________ StrideAs___________ StrideBs__________ StrideCs___________ + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960,960,960,960 1024,1024,1024,1024 1024,1024,1024,1024 960,960,960,960 1024,1024,1024,1024 1024,1024,1024,1024 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960,960,960,960 2048,2048,2048,2048 2048,2048,2048,2048 
960,960,960,960 2048,2048,2048,2048 2048,2048,2048,2048 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840,3840,3840,3840 4096,4096,4096,4096 4096,4096,4096,4096 3840,3840,3840,3840 4096,4096,4096,4096 4096,4096,4096,4096 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680,7680,7680,7680 8192,8192,8192,8192 8192,8192,8192,8192 7680,7680,7680,7680 8192,8192,8192,8192 8192,8192,8192,8192 diff --git a/script/profile_groupnorm.sh b/script/profile_groupnorm.sh new file mode 100755 index 0000000000..6586caec40 --- /dev/null +++ b/script/profile_groupnorm.sh @@ -0,0 +1,15 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY=$3 +INIT=$4 +LOG=$5 +TIME=$6 + +######## op datatype verify init log time length +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 1 16 16 32 40 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 2 64 64 64 64 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 2 64 64 128 128 \ No newline at end of file diff --git a/script/profile_layernorm.sh b/script/profile_layernorm.sh new file mode 100755 index 0000000000..0e368b9451 --- /dev/null +++ b/script/profile_layernorm.sh @@ -0,0 +1,15 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY=$3 +INIT=$4 +LOG=$5 +TIME=$6 + +######## op datatype verify init log time length +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 256 256 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 1024 1024 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 4096 4096 diff --git a/script/profile_max_pool2d_bwd.sh b/script/profile_max_pool2d_bwd.sh new file mode 100755 index 0000000000..587a8ab350 --- /dev/null +++ b/script/profile_max_pool2d_bwd.sh @@ -0,0 +1,14 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY=$3 +INIT=$4 +LOG=$5 +TIME=$6 + +######## 
op datatype verify init log time length(NCHW) window size(YX) stride dilation left pad right pad +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 2 32 30 30 --wsize 2 2 --wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1 --dmmy 28 29 30 31 32 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 64 32 256 256 --wsize 2 2 --wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1 --dmmy 28 29 30 31 32 diff --git a/script/profile_max_pool2d_fwd.sh b/script/profile_max_pool2d_fwd.sh new file mode 100755 index 0000000000..9e7f8293ba --- /dev/null +++ b/script/profile_max_pool2d_fwd.sh @@ -0,0 +1,16 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY=$3 +INIT=$4 +LOG=$5 +TIME=$6 + +######## op datatype verify init log time return_idx length(NCHW) window size(YX) stride dilation left pad right pad +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 0 --length 2 32 30 30 --wsize 2 2 --wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 0 --length 64 32 256 256 --wsize 2 2 --wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 1 --length 2 32 30 30 --wsize 2 2 --wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 1 --length 64 32 256 256 --wsize 2 2 --wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1 \ No newline at end of file diff --git a/script/profile_max_pool3d.sh b/script/profile_max_pool3d.sh new file mode 100755 index 0000000000..2c8644e2a6 --- /dev/null +++ b/script/profile_max_pool3d.sh @@ -0,0 +1,14 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY=$3 +INIT=$4 +LOG=$5 +TIME=$6 + +######## op datatype verify init log time length(NCDHW) window size(YX) stride dilation left pad right pad +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 2 32 30 30 30 --wsize 2 2 2 --wstride 
2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 8 16 32 256 256 --wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1 diff --git a/script/profile_pool3d_fwd.sh b/script/profile_pool3d_fwd.sh new file mode 100755 index 0000000000..7fa102394b --- /dev/null +++ b/script/profile_pool3d_fwd.sh @@ -0,0 +1,17 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY=$3 +INIT=$4 +LOG=$5 +TIME=$6 +REDUCEOP=$7 + +######## op datatype verify init log time return_index reduce_op length(NCDHW) window size(YX) stride dilation left pad right pad +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 0 $REDUCEOP --length 2 32 30 30 30 --wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 0 $REDUCEOP --length 8 16 32 256 256 --wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 1 $REDUCEOP --length 2 32 30 30 30 --wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 1 $REDUCEOP --length 8 16 32 256 256 --wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1 \ No newline at end of file diff --git a/script/profile_softmax.sh b/script/profile_softmax.sh new file mode 100755 index 0000000000..f292860357 --- /dev/null +++ b/script/profile_softmax.sh @@ -0,0 +1,18 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY=$3 +INIT=$4 +LOG=$5 +TIME=$6 + +######## op datatype verify init log time length stride reduce +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 8 4 256 --stride 1024 256 1 --reduce 2 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 2 8 128 1024 --stride 2097152 1048576 131072 1 --reduce 2 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG 
$TIME --length 2 8 128 1024 --stride 2097152 1048576 131072 1 --reduce 3 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 2 32 512 4096 --stride 134217728 67108864 2097152 1 --reduce 2 +$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length 2 32 512 4096 --stride 134217728 67108864 2097152 1 --reduce 3 + diff --git a/script/profile_transpose.sh b/script/profile_transpose.sh new file mode 100755 index 0000000000..b6d3941269 --- /dev/null +++ b/script/profile_transpose.sh @@ -0,0 +1,13 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +VERIFY=$3 +INIT=$4 +LOG=$5 +TIME=$6 + +######## op datatype verify init log time N C D H W + $DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME 4 8 8 512 512 diff --git a/script/run_full_performance_tests.sh b/script/run_full_performance_tests.sh index ddc5c270b8..ee63bb57a5 100755 --- a/script/run_full_performance_tests.sh +++ b/script/run_full_performance_tests.sh @@ -12,6 +12,8 @@ # environment tag : a string describing the specifics of your test environment # branch name : name of the branch in git repo (git status | grep -e 'On branch') # node name : $hostname +# extended op = 0 : do not include extended op (default) +# = 1 : include extended op #get the command line arguments: export verify=$1 @@ -22,16 +24,19 @@ export branch=$3 echo 'Branch name: ' $branch export host_name=$4 echo 'Host name: ' $host_name +export inlcude_extended_op=$5 +echo 'Inlcude extended op: ' $inlcude_extended_op + function print_log_header(){ - rm -f $1; - echo 'On branch ' $3 &> $1; - echo 'Node name: ' $4 >> $1; - #get GPU_arch and number of compute units from rocminfo - echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; - rocminfo | grep "Compute Unit:" >> $1; - hipcc --version | grep -e 'HIP version' >> $1; - echo 'Environment type: ' $2 >> $1; - /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; + rm -f $1; + echo 'On branch ' $3 &> $1; + 
echo 'Node name: ' $4 >> $1; + #get GPU_arch and number of compute units from rocminfo + echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; + rocminfo | grep "Compute Unit:" >> $1; + hipcc --version | grep -e 'HIP version' >> $1; + echo 'Environment type: ' $2 >> $1; + /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; } #run gemm tests @@ -103,6 +108,9 @@ print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name ./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log ./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log ./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 2 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log +./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 0 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log +./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log +./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 2 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log #run grouped_bwd_weight tests export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight.log" @@ -148,4 +156,472 @@ print_log_header $onnx_log $env_type $branch $host_name export mixed_gemm_log="perf_mixed_gemm.log" print_log_header $mixed_gemm_log $env_type $branch $host_name ./profile_mixed_gemm.sh gemm_splitk 4 0 $verify 2 0 1 16 2>&1 | tee -a $mixed_gemm_log -./profile_mixed_gemm.sh gemm_splitk 5 0 $verify 2 0 1 16 2>&1 | tee -a $mixed_gemm_log \ No newline at end of file +./profile_mixed_gemm.sh gemm_splitk 5 0 $verify 2 0 1 16 2>&1 | tee -a $mixed_gemm_log + +if [ "$inlcude_extended_op" = "1" ]; then + #run batched_gemm_add_relu_gemm_add tests + export batched_gemm_add_relu_gemm_add_log="perf_batched_gemm_add_relu_gemm_add.log" + print_log_header $batched_gemm_add_relu_gemm_add_log 
$env_type $branch $host_name + ./profile_batched_gemm_gemm.sh batched_gemm_add_relu_gemm_add 1 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_add_relu_gemm_add_log + ./profile_batched_gemm_gemm.sh batched_gemm_add_relu_gemm_add 1 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_add_relu_gemm_add_log + + #run batched_gemm_b_scale tests + export batched_gemm_b_scale_log="perf_batched_gemm_b_scale.log" + print_log_header $batched_gemm_b_scale_log $env_type $branch $host_name + ./profile_batched_gemm_b_scale.sh batched_gemm_b_scale 8 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_b_scale_log + + #run batched_gemm_gemm tests + export batched_gemm_gemm_log="perf_batched_gemm_gemm.log" + print_log_header $batched_gemm_gemm_log $env_type $branch $host_name + ./profile_batched_gemm_gemm.sh batched_gemm_gemm 1 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_gemm_log + ./profile_batched_gemm_gemm.sh batched_gemm_gemm 1 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_gemm_log + + #run batched_gemm_multi_d tests + export batched_gemm_multi_d_log="perf_batched_gemm_multi_d.log" + print_log_header $batched_gemm_multi_d_log $env_type $branch $host_name + ./profile_batched_gemm.sh batched_gemm_multi_d 0 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log + ./profile_batched_gemm.sh batched_gemm_multi_d 0 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log + ./profile_batched_gemm.sh batched_gemm_multi_d 0 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log + ./profile_batched_gemm.sh batched_gemm_multi_d 0 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log + ./profile_batched_gemm.sh batched_gemm_multi_d 1 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log + ./profile_batched_gemm.sh batched_gemm_multi_d 1 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log + ./profile_batched_gemm.sh batched_gemm_multi_d 1 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log + ./profile_batched_gemm.sh batched_gemm_multi_d 1 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log + + 
#run batched_gemm_reduce tests + export batched_gemm_reduce_log="perf_batched_gemm_reduce.log" + print_log_header $batched_gemm_reduce_log $env_type $branch $host_name + ./profile_batched_gemm_reduce.sh batched_gemm_reduce 1 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_reduce_log + ./profile_batched_gemm_reduce.sh batched_gemm_reduce 1 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_reduce_log + ./profile_batched_gemm_reduce.sh batched_gemm_reduce 1 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_reduce_log + ./profile_batched_gemm_reduce.sh batched_gemm_reduce 1 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_reduce_log + + #run contraction_bilinear tests + export contraction_bilinear_log="perf_contraction_bilinear.log" + print_log_header $contraction_bilinear_log $env_type $branch $host_name + ./profile_contraction_bilinear.sh contraction_bilinear 0 0 $verify 1 0 1 2>&1 | tee -a $contraction_bilinear_log + ./profile_contraction_bilinear.sh contraction_bilinear 1 0 $verify 1 0 1 2>&1 | tee -a $contraction_bilinear_log + + #run contraction_scale tests + export contraction_scale_log="perf_contraction_scale.log" + print_log_header $contraction_scale_log $env_type $branch $host_name + ./profile_contraction_scale.sh contraction_scale 0 0 $verify 1 0 1 2>&1 | tee -a $contraction_scale_log + ./profile_contraction_scale.sh contraction_scale 1 0 $verify 1 0 1 2>&1 | tee -a $contraction_scale_log + + #run conv_bwd_data tests + export conv_bwd_data_log="perf_conv_bwd_data.log" + print_log_header $conv_bwd_data_log $env_type $branch $host_name + ./profile_conv.sh conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log + ./profile_conv.sh conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log + ./profile_conv.sh conv_bwd_data 2 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log + ./profile_conv.sh conv_bwd_data 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log + + #run conv_fwd_bias_relu_add tests + export 
conv_fwd_bias_relu_add_log="perf_conv_fwd_bias_relu_add.log" + print_log_header $conv_fwd_bias_relu_add_log $env_type $branch $host_name + ./profile_conv_fwd_bias_relu_add.sh conv_fwd_bias_relu_add 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_bias_relu_add_log + + #run conv_fwd tests + export conv_fwd_log="perf_conv_fwd.log" + print_log_header $conv_fwd_log $env_type $branch $host_name + ./profile_conv.sh conv_fwd 0 0 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log + ./profile_conv.sh conv_fwd 1 0 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log + ./profile_conv.sh conv_fwd 2 0 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log + ./profile_conv.sh conv_fwd 3 0 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log + ./profile_conv.sh conv_fwd 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log + ./profile_conv.sh conv_fwd 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log + ./profile_conv.sh conv_fwd 2 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log + ./profile_conv.sh conv_fwd 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log + + #run conv_tensor_rearrange tests + export conv_tensor_rearrange_log="perf_conv_tensor_rearrange.log" + print_log_header $conv_tensor_rearrange_log $env_type $branch $host_name + ./profile_conv_tensor_rearrange.sh conv_tensor_rearrange 0 0 $verify 1 0 1 0 256 2>&1 | tee -a $conv_tensor_rearrange_log + ./profile_conv_tensor_rearrange.sh conv_tensor_rearrange 1 0 $verify 1 0 1 0 256 2>&1 | tee -a $conv_tensor_rearrange_log + ./profile_conv_tensor_rearrange.sh conv_tensor_rearrange 2 0 $verify 1 0 1 0 256 2>&1 | tee -a $conv_tensor_rearrange_log + ./profile_conv_tensor_rearrange.sh conv_tensor_rearrange 3 0 $verify 1 0 1 0 256 2>&1 | tee -a $conv_tensor_rearrange_log + ./profile_conv_tensor_rearrange.sh conv_tensor_rearrange 1 1 $verify 1 0 1 0 256 2>&1 | tee -a $conv_tensor_rearrange_log + ./profile_conv_tensor_rearrange.sh conv_tensor_rearrange 3 1 $verify 1 0 1 0 256 2>&1 | tee -a $conv_tensor_rearrange_log + ./profile_conv_tensor_rearrange.sh 
conv_tensor_rearrange 1 0 $verify 1 0 1 1 256 2>&1 | tee -a $conv_tensor_rearrange_log + ./profile_conv_tensor_rearrange.sh conv_tensor_rearrange 1 1 $verify 1 0 1 1 256 2>&1 | tee -a $conv_tensor_rearrange_log + + #run gemm_ab_scale tests + export gemm_ab_scale_log="perf_gemm_ab_scale.log" + print_log_header $gemm_ab_scale_log $env_type $branch $host_name + ./profile_gemm_b_scale.sh gemm_ab_scale 7 1 $verify 1 0 1 2>&1 | tee -a $gemm_ab_scale_log + + #run gemm_add_add_fastgelu tests + export gemm_add_add_fastgelu_log="perf_gemm_add_add_fastgelu.log" + print_log_header $gemm_add_add_fastgelu_log $env_type $branch $host_name + ./profile_gemm_d0_d1_e.sh gemm_add_add_fastgelu 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_add_add_fastgelu_log + ./profile_gemm_d0_d1_e.sh gemm_add_add_fastgelu 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_add_add_fastgelu_log + + #run gemm_add_fastgelu tests + export gemm_add_fastgelu_log="perf_gemm_add_fastgelu.log" + print_log_header $gemm_add_fastgelu_log $env_type $branch $host_name + ./profile_gemm_d0_e.sh gemm_add_fastgelu 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_add_fastgelu_log + ./profile_gemm_d0_e.sh gemm_add_fastgelu 2 0 $verify 1 0 1 2>&1 | tee -a $gemm_add_fastgelu_log + ./profile_gemm_d0_e.sh gemm_add_fastgelu 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_add_fastgelu_log + ./profile_gemm_d0_e.sh gemm_add_fastgelu 2 1 $verify 1 0 1 2>&1 | tee -a $gemm_add_fastgelu_log + + #run gemm_add_multiply tests + export gemm_add_multiply_log="perf_gemm_add_multiply.log" + print_log_header $gemm_add_multiply_log $env_type $branch $host_name + ./profile_gemm_d0_d1_e.sh gemm_add_multiply 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_add_multiply_log + ./profile_gemm_d0_d1_e.sh gemm_add_multiply 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_add_multiply_log + + #run gemm_add_relu_add_layernorm tests + export gemm_add_relu_add_layernorm_log="perf_gemm_add_relu_add_layernorm.log" + print_log_header $gemm_add_relu_add_layernorm_log $env_type $branch $host_name + 
./profile_gemm_d0_d1_e.sh gemm_add_relu_add_layernorm 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_add_relu_add_layernorm_log + ./profile_gemm_d0_d1_e.sh gemm_add_relu_add_layernorm 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_add_relu_add_layernorm_log + + #run gemm_add_relu tests + export gemm_add_relu_log="perf_gemm_add_relu.log" + print_log_header $gemm_add_relu_log $env_type $branch $host_name + ./profile_gemm_d0_e.sh gemm_add_relu 0 0 $verify 1 0 1 2>&1 | tee -a $gemm_add_relu_log + ./profile_gemm_d0_e.sh gemm_add_relu 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_add_relu_log + + #run gemm_add_silu tests + export gemm_add_silu_log="perf_gemm_add_silu.log" + print_log_header $gemm_add_silu_log $env_type $branch $host_name + ./profile_gemm_d0_e.sh gemm_add_silu 0 0 $verify 1 0 1 2>&1 | tee -a $gemm_add_silu_log + ./profile_gemm_d0_e.sh gemm_add_silu 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_add_silu_log + ./profile_gemm_d0_e.sh gemm_add_silu 0 1 $verify 1 0 1 2>&1 | tee -a $gemm_add_silu_log + ./profile_gemm_d0_e.sh gemm_add_silu 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_add_silu_log + + #run gemm_add tests + export gemm_add_log="perf_gemm_add.log" + print_log_header $gemm_add_log $env_type $branch $host_name + ./profile_gemm_d0_e.sh gemm_add 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_add_log + ./profile_gemm_d0_e.sh gemm_add 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_add_log + + #run gemm_b_scale tests + export gemm_b_scale_log="perf_gemm_b_scale.log" + print_log_header $gemm_b_scale_log $env_type $branch $host_name + ./profile_gemm_b_scale.sh gemm_b_scale 8 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_b_scale_log + + #run gemm_bias_add_reduce tests + export gemm_bias_add_reduce_log="perf_gemm_bias_add_reduce.log" + print_log_header $gemm_bias_add_reduce_log $env_type $branch $host_name + ./profile_gemm_d0_e.sh gemm_bias_add_reduce 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_bias_add_reduce_log + ./profile_gemm_d0_e.sh gemm_bias_add_reduce 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_bias_add_reduce_log + + #run 
gemm_fastgelu tests + export gemm_fastgelu_log="perf_gemm_fastgelu.log" + print_log_header $gemm_fastgelu_log $env_type $branch $host_name + ./profile_gemm.sh gemm_fastgelu 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_fastgelu_log + ./profile_gemm.sh gemm_fastgelu 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_fastgelu_log + + #run gemm_multiply_add tests + export gemm_multiply_add_log="perf_gemm_multiply_add.log" + print_log_header $gemm_multiply_add_log $env_type $branch $host_name + ./profile_gemm_d0_d1_e.sh gemm_multiply_add 0 0 $verify 1 0 1 2>&1 | tee -a $gemm_multiply_add_log + ./profile_gemm_d0_d1_e.sh gemm_multiply_add 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_multiply_add_log + ./profile_gemm_d0_d1_e.sh gemm_multiply_add 0 1 $verify 1 0 1 2>&1 | tee -a $gemm_multiply_add_log + ./profile_gemm_d0_d1_e.sh gemm_multiply_add 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_multiply_add_log + + #run gemm_multiply_multiply_weight_preshuffle tests + export gemm_multiply_multiply_weight_preshuffle_log="perf_gemm_multiply_multiply_weight_preshuffle.log" + print_log_header $gemm_multiply_multiply_weight_preshuffle_log $env_type $branch $host_name + ./profile_gemm_d0_d1_e.sh gemm_multiply_multiply_weight_preshuffle 0 0 $verify 1 0 1 2>&1 | tee -a $gemm_multiply_multiply_weight_preshuffle_log + ./profile_gemm_d0_d1_e.sh gemm_multiply_multiply_weight_preshuffle 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_multiply_multiply_weight_preshuffle_log + + #run gemm_multiply_multiply tests + export gemm_multiply_multiply_log="perf_gemm_multiply_multiply.log" + print_log_header $gemm_multiply_multiply_log $env_type $branch $host_name + ./profile_gemm_d0_d1_e.sh gemm_multiply_multiply 7 1 $verify 1 0 1 2>&1 | tee -a $gemm_multiply_multiply_log + ./profile_gemm_d0_d1_e.sh gemm_multiply_multiply 9 1 $verify 1 0 1 2>&1 | tee -a $gemm_multiply_multiply_log + ./profile_gemm_d0_d1_e.sh gemm_multiply_multiply 10 1 $verify 1 0 1 2>&1 | tee -a $gemm_multiply_multiply_log + + #run gemm_reduce tests + export 
gemm_reduce_log="perf_gemm_reduce.log" + print_log_header $gemm_reduce_log $env_type $branch $host_name + ./profile_splitK_gemm.sh gemm_reduce 1 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_reduce_log + ./profile_splitK_gemm.sh gemm_reduce 1 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_reduce_log + + #run gemm_streamk tests + export gemm_streamk_log="perf_gemm_streamk.log" + print_log_header $gemm_streamk_log $env_type $branch $host_name + ./profile_gemm.sh gemm_streamk 0 0 $verify 1 0 1 2>&1 | tee -a $gemm_streamk_log + ./profile_gemm.sh gemm_streamk 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_streamk_log + ./profile_gemm.sh gemm_streamk 2 0 $verify 1 0 1 2>&1 | tee -a $gemm_streamk_log + ./profile_gemm.sh gemm_streamk 3 0 $verify 1 0 1 2>&1 | tee -a $gemm_streamk_log + ./profile_gemm.sh gemm_streamk 0 1 $verify 1 0 1 2>&1 | tee -a $gemm_streamk_log + ./profile_gemm.sh gemm_streamk 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_streamk_log + ./profile_gemm.sh gemm_streamk 2 1 $verify 1 0 1 2>&1 | tee -a $gemm_streamk_log + ./profile_gemm.sh gemm_streamk 3 1 $verify 1 0 1 2>&1 | tee -a $gemm_streamk_log + + #run gemm_universal_batched tests + export gemm_universal_batched_log="perf_gemm_universal_batched.log" + print_log_header $gemm_universal_batched_log $env_type $branch $host_name + ./profile_gemm_universal_batched.sh gemm_universal_batched 0 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_batched_log + ./profile_gemm_universal_batched.sh gemm_universal_batched 1 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_batched_log + ./profile_gemm_universal_batched.sh gemm_universal_batched 0 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_batched_log + ./profile_gemm_universal_batched.sh gemm_universal_batched 1 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_batched_log + + #run gemm_universal_reduce tests + export gemm_universal_reduce_log="perf_gemm_universal_reduce.log" + print_log_header $gemm_universal_reduce_log $env_type $branch $host_name + ./profile_splitK_gemm.sh gemm_universal_reduce 0 
0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 1 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 2 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 3 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 4 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 5 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 6 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 0 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 1 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 2 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 3 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 4 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 5 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + ./profile_splitK_gemm.sh gemm_universal_reduce 6 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_reduce_log + + #run gemm_universal_streamk tests + export gemm_universal_streamk_log="perf_gemm_universal_streamk.log" + print_log_header $gemm_universal_streamk_log $env_type $branch $host_name + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 0 0 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 1 0 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 2 0 $verify 1 0 1 0 
2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 3 0 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 4 0 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 5 0 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 6 0 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 0 1 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 1 1 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 2 1 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 3 1 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 4 1 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 5 1 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 6 1 $verify 1 0 1 0 2>&1 | tee -a $gemm_universal_streamk_log + + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 0 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 1 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 2 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 3 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 4 0 $verify 1 0 1 1 2>&1 | tee -a 
$gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 5 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 6 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 0 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 1 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 2 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 3 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 4 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 5 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 6 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_streamk_log + + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 0 0 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 1 0 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 2 0 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 3 0 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 4 0 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 5 0 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 6 0 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + 
./profile_gemm_universal_streamk.sh gemm_universal_streamk 0 1 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 1 1 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 2 1 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 3 1 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 4 1 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 5 1 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + ./profile_gemm_universal_streamk.sh gemm_universal_streamk 6 1 $verify 1 0 1 2 2>&1 | tee -a $gemm_universal_streamk_log + + #run gemm_universal tests + export gemm_universal_log="perf_gemm_universal.log" + print_log_header $gemm_universal_log $env_type $branch $host_name + ./profile_splitK_gemm.sh gemm_universal 0 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 1 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 2 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 3 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 4 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 5 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 6 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 7 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 8 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 9 0 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + + ./profile_splitK_gemm.sh 
gemm_universal 0 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 1 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 2 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 3 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 4 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 5 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 6 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 7 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 8 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + ./profile_splitK_gemm.sh gemm_universal 9 1 $verify 1 0 1 1 2>&1 | tee -a $gemm_universal_log + + #run grouped_conv_fwd_outelementop tests + export grouped_conv_fwd_outelementop_log="perf_grouped_conv_fwd_outelementop.log" + print_log_header $grouped_conv_fwd_outelementop_log $env_type $branch $host_name + ./profile_grouped_conv_fwd_outelementop.sh grouped_conv_fwd_outelementop 0 0 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_outelementop_log + ./profile_grouped_conv_fwd_outelementop.sh grouped_conv_fwd_outelementop 1 0 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_outelementop_log + ./profile_grouped_conv_fwd_outelementop.sh grouped_conv_fwd_outelementop 2 0 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_outelementop_log + ./profile_grouped_conv_fwd_outelementop.sh grouped_conv_fwd_outelementop 3 0 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_outelementop_log + ./profile_grouped_conv_fwd_outelementop.sh grouped_conv_fwd_outelementop 0 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_outelementop_log + ./profile_grouped_conv_fwd_outelementop.sh grouped_conv_fwd_outelementop 1 1 1 $verify 1 0 1 256 2>&1 | tee -a 
$grouped_conv_fwd_outelementop_log + ./profile_grouped_conv_fwd_outelementop.sh grouped_conv_fwd_outelementop 2 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_outelementop_log + ./profile_grouped_conv_fwd_outelementop.sh grouped_conv_fwd_outelementop 3 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_outelementop_log + + #run grouped_gemm_fastgelu tests + export grouped_gemm_fastgelu_log="perf_grouped_gemm_fastgelu.log" + print_log_header $grouped_gemm_fastgelu_log $env_type $branch $host_name + ./profile_grouped_gemm.sh grouped_gemm_fastgelu 1 0 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_fastgelu_log + ./profile_grouped_gemm.sh grouped_gemm_fastgelu 1 1 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_fastgelu_log + + #run grouped_gemm_fixed_nk tests + export grouped_gemm_fixed_nk_log="perf_grouped_gemm_fixed_nk.log" + print_log_header $grouped_gemm_fixed_nk_log $env_type $branch $host_name + ./profile_grouped_gemm_fixed_nk.sh grouped_gemm_fixed_nk 0 0 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_fixed_nk_log + ./profile_grouped_gemm_fixed_nk.sh grouped_gemm_fixed_nk 1 0 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_fixed_nk_log + ./profile_grouped_gemm_fixed_nk.sh grouped_gemm_fixed_nk 2 0 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_fixed_nk_log + ./profile_grouped_gemm_fixed_nk.sh grouped_gemm_fixed_nk 3 0 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_fixed_nk_log + ./profile_grouped_gemm_fixed_nk.sh grouped_gemm_fixed_nk 0 1 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_fixed_nk_log + ./profile_grouped_gemm_fixed_nk.sh grouped_gemm_fixed_nk 1 1 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_fixed_nk_log + ./profile_grouped_gemm_fixed_nk.sh grouped_gemm_fixed_nk 2 1 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_fixed_nk_log + ./profile_grouped_gemm_fixed_nk.sh grouped_gemm_fixed_nk 3 1 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_fixed_nk_log + + #run grouped_gemm_multiply_tile_loop tests + export grouped_gemm_multiply_tile_loop_log="perf_grouped_gemm_multiply_tile_loop.log" + 
print_log_header $grouped_gemm_multiply_tile_loop_log $env_type $branch $host_name + ./profile_grouped_gemm.sh grouped_gemm_multiply_tile_loop 0 0 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_multiply_tile_loop_log + + #run grouped_gemm_tile_loop tests + export grouped_gemm_tile_loop_log="perf_grouped_gemm_tile_loop.log" + print_log_header $grouped_gemm_tile_loop_log $env_type $branch $host_name + ./profile_grouped_gemm.sh grouped_gemm_tile_loop 0 0 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_tile_loop_log + ./profile_grouped_gemm.sh grouped_gemm_tile_loop 0 1 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_tile_loop_log + + #run groupnorm tests + export groupnorm_log="perf_groupnorm.log" + print_log_header $groupnorm_log $env_type $branch $host_name + ./profile_groupnorm.sh groupnorm 0 $verify 1 0 1 2>&1 | tee -a $groupnorm_log + ./profile_groupnorm.sh groupnorm 1 $verify 1 0 1 2>&1 | tee -a $groupnorm_log + + #run permute_scale tests + export permute_scale_log="perf_permute_scale.log" + print_log_header $permute_scale_log $env_type $branch $host_name + ./profile_permute_scale.sh permute_scale 0 $verify 1 0 1 2>&1 | tee -a $permute_scale_log + ./profile_permute_scale.sh permute_scale 1 $verify 1 0 1 2>&1 | tee -a $permute_scale_log + + #run transpose tests + export transpose_log="perf_transpose.log" + print_log_header $transpose_log $env_type $branch $host_name + ./profile_transpose.sh transpose 0 $verify 1 0 1 2>&1 | tee -a $transpose_log + ./profile_transpose.sh transpose 1 $verify 1 0 1 2>&1 | tee -a $transpose_log + + #run avg_pool2d_bwd tests + export avg_pool2d_bwd_log="perf_avg_pool2d_bwd.log" + print_log_header $avg_pool2d_bwd_log $env_type $branch $host_name + ./profile_avg_pool2d_bwd.sh avg_pool2d_bwd 0 $verify 1 0 1 2>&1 | tee -a $avg_pool2d_bwd_log + ./profile_avg_pool2d_bwd.sh avg_pool2d_bwd 1 $verify 1 0 1 2>&1 | tee -a $avg_pool2d_bwd_log + ./profile_avg_pool2d_bwd.sh avg_pool2d_bwd 3 $verify 1 0 1 2>&1 | tee -a $avg_pool2d_bwd_log + 
./profile_avg_pool2d_bwd.sh avg_pool2d_bwd 5 $verify 1 0 1 2>&1 | tee -a $avg_pool2d_bwd_log + ./profile_avg_pool2d_bwd.sh avg_pool2d_bwd 7 $verify 1 0 1 2>&1 | tee -a $avg_pool2d_bwd_log + + #run avg_pool3d_bwd tests + export avg_pool3d_bwd_log="perf_avg_pool3d_bwd.log" + print_log_header $avg_pool3d_bwd_log $env_type $branch $host_name + ./profile_avg_pool3d_bwd.sh avg_pool3d_bwd 0 $verify 1 0 1 2>&1 | tee -a $avg_pool3d_bwd_log + ./profile_avg_pool3d_bwd.sh avg_pool3d_bwd 1 $verify 1 0 1 2>&1 | tee -a $avg_pool3d_bwd_log + ./profile_avg_pool3d_bwd.sh avg_pool3d_bwd 5 $verify 1 0 1 2>&1 | tee -a $avg_pool3d_bwd_log + + #run bnorm_bwd tests + export bnorm_bwd_log="perf_bnorm_bwd.log" + print_log_header $bnorm_bwd_log $env_type $branch $host_name + ./profile_bnorm.sh bnorm_bwd 0 $verify 0 1 0 2>&1 | tee -a $bnorm_bwd_log + ./profile_bnorm.sh bnorm_bwd 1 $verify 0 1 0 2>&1 | tee -a $bnorm_bwd_log + ./profile_bnorm.sh bnorm_bwd 5 $verify 0 1 0 2>&1 | tee -a $bnorm_bwd_log + ./profile_bnorm.sh bnorm_bwd 6 $verify 0 1 0 2>&1 | tee -a $bnorm_bwd_log + + #run bnorm_fwd tests + export bnorm_fwd_log="perf_bnorm_fwd.log" + print_log_header $bnorm_fwd_log $env_type $branch $host_name + ./profile_bnorm_fwd.sh bnorm_fwd 0 $verify 0 1 0 2>&1 | tee -a $bnorm_fwd_log + ./profile_bnorm_fwd.sh bnorm_fwd 1 $verify 0 1 0 2>&1 | tee -a $bnorm_fwd_log + ./profile_bnorm_fwd.sh bnorm_fwd 5 $verify 0 1 0 2>&1 | tee -a $bnorm_fwd_log + ./profile_bnorm_fwd.sh bnorm_fwd 6 $verify 0 1 0 2>&1 | tee -a $bnorm_fwd_log + + #run bnorm_infer tests + export bnorm_infer_log="perf_bnorm_infer.log" + print_log_header $bnorm_infer_log $env_type $branch $host_name + ./profile_bnorm.sh bnorm_infer 0 $verify 0 1 0 2>&1 | tee -a $bnorm_infer_log + ./profile_bnorm.sh bnorm_infer 1 $verify 0 1 0 2>&1 | tee -a $bnorm_infer_log + ./profile_bnorm.sh bnorm_infer 5 $verify 0 1 0 2>&1 | tee -a $bnorm_infer_log + ./profile_bnorm.sh bnorm_infer 6 $verify 0 1 0 2>&1 | tee -a $bnorm_infer_log + + #run 
groupnorm_bwd_data tests + export groupnorm_bwd_data_log="perf_groupnorm_bwd_data.log" + print_log_header $groupnorm_bwd_data_log $env_type $branch $host_name + ./profile_groupnorm.sh groupnorm_bwd_data 1 $verify 1 0 1 2>&1 | tee -a $groupnorm_bwd_data_log + + #run groupnorm_bwd_gamma_beta tests + export groupnorm_bwd_gamma_beta_log="perf_groupnorm_bwd_gamma_beta.log" + print_log_header $groupnorm_bwd_gamma_beta_log $env_type $branch $host_name + ./profile_groupnorm.sh groupnorm_bwd_gamma_beta 0 $verify 1 0 1 2>&1 | tee -a $groupnorm_bwd_gamma_beta_log + ./profile_groupnorm.sh groupnorm_bwd_gamma_beta 1 $verify 1 0 1 2>&1 | tee -a $groupnorm_bwd_gamma_beta_log + + #run layernorm_bwd_data tests + export layernorm_bwd_data_log="perf_layernorm_bwd_data.log" + print_log_header $layernorm_bwd_data_log $env_type $branch $host_name + ./profile_layernorm.sh layernorm_bwd_data 0 $verify 1 0 1 2>&1 | tee -a $layernorm_bwd_data_log + ./profile_layernorm.sh layernorm_bwd_data 1 $verify 1 0 1 2>&1 | tee -a $layernorm_bwd_data_log + + #run layernorm_bwd_gamma_beta tests + export layernorm_bwd_gamma_beta_log="perf_layernorm_bwd_gamma_beta.log" + print_log_header $layernorm_bwd_gamma_beta_log $env_type $branch $host_name + ./profile_layernorm.sh layernorm_bwd_gamma_beta 0 $verify 1 0 1 2>&1 | tee -a $layernorm_bwd_gamma_beta_log + ./profile_layernorm.sh layernorm_bwd_gamma_beta 1 $verify 1 0 1 2>&1 | tee -a $layernorm_bwd_gamma_beta_log + + #run layernorm_fwd tests + export layernorm_fwd_log="perf_layernorm_fwd.log" + print_log_header $layernorm_fwd_log $env_type $branch $host_name + ./profile_layernorm.sh layernorm_fwd 0 $verify 1 0 1 2>&1 | tee -a $layernorm_fwd_log + ./profile_layernorm.sh layernorm_fwd 1 $verify 1 0 1 2>&1 | tee -a $layernorm_fwd_log + + #run max_pool2d_bwd tests + export max_pool2d_bwd_log="perf_max_pool2d_bwd.log" + print_log_header $max_pool2d_bwd_log $env_type $branch $host_name + ./profile_max_pool2d_bwd.sh max_pool2d_bwd 0 $verify 1 0 1 2>&1 | tee -a 
$max_pool2d_bwd_log + ./profile_max_pool2d_bwd.sh max_pool2d_bwd 1 $verify 1 0 1 2>&1 | tee -a $max_pool2d_bwd_log + ./profile_max_pool2d_bwd.sh max_pool2d_bwd 3 $verify 1 0 1 2>&1 | tee -a $max_pool2d_bwd_log + ./profile_max_pool2d_bwd.sh max_pool2d_bwd 5 $verify 1 0 1 2>&1 | tee -a $max_pool2d_bwd_log + + #run max_pool2d_fwd tests + export max_pool2d_fwd_log="perf_max_pool2d_fwd.log" + print_log_header $max_pool2d_fwd_log $env_type $branch $host_name + ./profile_max_pool2d_fwd.sh max_pool2d_fwd 0 $verify 1 0 1 2>&1 | tee -a $max_pool2d_fwd_log + ./profile_max_pool2d_fwd.sh max_pool2d_fwd 1 $verify 1 0 1 2>&1 | tee -a $max_pool2d_fwd_log + ./profile_max_pool2d_fwd.sh max_pool2d_fwd 2 $verify 1 0 1 2>&1 | tee -a $max_pool2d_fwd_log + ./profile_max_pool2d_fwd.sh max_pool2d_fwd 3 $verify 1 0 1 2>&1 | tee -a $max_pool2d_fwd_log + ./profile_max_pool2d_fwd.sh max_pool2d_fwd 4 $verify 1 0 1 2>&1 | tee -a $max_pool2d_fwd_log + + #run max_pool3d_bwd tests + export max_pool3d_bwd_log="perf_max_pool3d_bwd.log" + print_log_header $max_pool3d_bwd_log $env_type $branch $host_name + ./profile_max_pool3d.sh max_pool3d_bwd 0 $verify 1 0 1 2>&1 | tee -a $max_pool3d_bwd_log + ./profile_max_pool3d.sh max_pool3d_bwd 1 $verify 1 0 1 2>&1 | tee -a $max_pool3d_bwd_log + ./profile_max_pool3d.sh max_pool3d_bwd 5 $verify 1 0 1 2>&1 | tee -a $max_pool3d_bwd_log + + #run pool3d_fwd tests + export pool3d_fwd_log="perf_pool3d_fwd.log" + print_log_header $pool3d_fwd_log $env_type $branch $host_name + ./profile_pool3d_fwd.sh pool3d_fwd 0 $verify 1 0 1 0 2>&1 | tee -a $pool3d_fwd_log + ./profile_pool3d_fwd.sh pool3d_fwd 1 $verify 1 0 1 0 2>&1 | tee -a $pool3d_fwd_log + ./profile_pool3d_fwd.sh pool3d_fwd 3 $verify 1 0 1 0 2>&1 | tee -a $pool3d_fwd_log + ./profile_pool3d_fwd.sh pool3d_fwd 5 $verify 1 0 1 0 2>&1 | tee -a $pool3d_fwd_log + ./profile_pool3d_fwd.sh pool3d_fwd 7 $verify 1 0 1 0 2>&1 | tee -a $pool3d_fwd_log + ./profile_pool3d_fwd.sh pool3d_fwd 0 $verify 1 0 1 1 2>&1 | tee -a 
$pool3d_fwd_log + ./profile_pool3d_fwd.sh pool3d_fwd 1 $verify 1 0 1 1 2>&1 | tee -a $pool3d_fwd_log + ./profile_pool3d_fwd.sh pool3d_fwd 3 $verify 1 0 1 1 2>&1 | tee -a $pool3d_fwd_log + ./profile_pool3d_fwd.sh pool3d_fwd 5 $verify 1 0 1 1 2>&1 | tee -a $pool3d_fwd_log + ./profile_pool3d_fwd.sh pool3d_fwd 7 $verify 1 0 1 1 2>&1 | tee -a $pool3d_fwd_log + + #run softmax tests + export softmax_log="perf_softmax.log" + print_log_header $softmax_log $env_type $branch $host_name + ./profile_softmax.sh softmax 0 $verify 1 0 1 2>&1 | tee -a $softmax_log + ./profile_softmax.sh softmax 1 $verify 1 0 1 2>&1 | tee -a $softmax_log + ./profile_softmax.sh softmax 2 $verify 1 0 1 2>&1 | tee -a $softmax_log + ./profile_softmax.sh softmax 3 $verify 1 0 1 2>&1 | tee -a $softmax_log +fi \ No newline at end of file