[CK] Add test script for all ckProfiler ops

1. Add test scripts for all ckProfiler ops.
2. Extend the script run_full_performance_tests.sh to include all ops.
This commit is contained in:
Qun Lin
2025-05-27 11:15:55 +08:00
parent b1ed92b131
commit a56a2b20cd
28 changed files with 1013 additions and 31 deletions

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on two NCHW input shapes.
# Positional arguments: op datatype verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6

# Options common to both runs, in legend order:
# window size(YX), stride, dilation, left pad, right pad, followed by --dmmy.
# NOTE(review): how the profiler treats the --dmmy values is not visible from
# this script - confirm against the profiler's argument parser.
window_opts=(--wsize 2 2 --wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1 --dmmy 28 29 30 31 32)

# Launch the profiler with the shared leading arguments followed by the
# options passed to this function. The leading expansions are unquoted, so an
# argument that was not supplied contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME "$@"
}

######## length(NCHW)
run_case --length 2 32 30 30 "${window_opts[@]}"
run_case --length 64 256 64 64 "${window_opts[@]}"

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on two NCDHW input shapes.
# Positional arguments: op datatype verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6

# Options common to both runs (three values each):
# window size, stride, dilation, left pad, right pad.
window_opts=(--wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1)

# Launch the profiler with the shared leading arguments followed by the
# options passed to this function. The leading expansions are unquoted, so an
# argument that was not supplied contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME "$@"
}

######## length(NCDHW)
run_case --length 2 32 30 30 30 "${window_opts[@]}"
run_case --length 64 64 64 64 64 "${window_opts[@]}"

View File

@@ -0,0 +1,20 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on four batched GEMM problem sizes.
# Positional arguments: op datatype layout verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# run_case M N K BatchCount
# Full argument order handed to the profiler:
#   op datatype layout B_block_tile verify init log time
#   M N K StrideA StrideB StrideC BatchCount KSplit
# B_block_tile and KSplit are fixed at 1 and the three strides at -1.
# The leading expansions are unquoted, so an argument that was not supplied
# contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT 1 $VERIFY $INIT $LOG $TIME "$1" "$2" "$3" -1 -1 -1 "$4" 1
}

run_case 960 1024 1024 8
run_case 1920 2048 2048 8
run_case 3840 4096 4096 4
run_case 7680 8192 8192 2
## TODO: B_block_tile is incorrect (profile_batched_gemm_b_scale)

View File

@@ -0,0 +1,17 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on four problem sizes.
# Positional arguments: op datatype layout verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# Launch the profiler with the shared leading arguments followed by the
# per-case values passed to this function. The leading expansions are
# unquoted, so an argument that was not supplied contributes no word.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$@"
}

######## M___ N___ K___ O___ BatchCount
run_case 960 1024 1024 1024 8
run_case 1920 2048 2048 2048 8
run_case 3840 4096 4096 4096 4
run_case 7680 8192 8192 8192 2

View File

@@ -0,0 +1,37 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on sixteen batched GEMM cases:
# four sizes with strides given as -1, then three groups of square sizes
# with explicit stride values.
# Positional arguments: op datatype layout verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# Launch the profiler with the shared leading arguments followed by
#   M N K StrideA StrideB StrideC BatchCount
# as passed to this function. The leading expansions are unquoted, so an
# argument that was not supplied contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$@"
}

####### strides passed as -1
run_case 960 1024 1024 -1 -1 -1 8
run_case 1920 2048 2048 -1 -1 -1 8
run_case 3840 4096 4096 -1 -1 -1 4
run_case 7680 8192 8192 -1 -1 -1 2

####### strides equal to the matrix dimension
run_case 1024 1024 1024 1024 1024 1024 8
run_case 2048 2048 2048 2048 2048 2048 8
run_case 4096 4096 4096 4096 4096 4096 4
run_case 8192 8192 8192 8192 8192 8192 2

####### strides equal to the matrix dimension + 32
run_case 1024 1024 1024 1056 1056 1056 8
run_case 2048 2048 2048 2080 2080 2080 8
run_case 4096 4096 4096 4128 4128 4128 4
run_case 8192 8192 8192 8224 8224 8224 2

####### strides equal to the matrix dimension + 64
run_case 1024 1024 1024 1088 1088 1088 8
run_case 2048 2048 2048 2112 2112 2112 8
run_case 4096 4096 4096 4160 4160 4160 4
run_case 8192 8192 8192 8256 8256 8256 2

13
script/profile_bnorm.sh Executable file
View File

@@ -0,0 +1,13 @@
#!/bin/bash
# Runs ckProfiler once for the op named by $1 on a single 4-D shape.
# Positional arguments: op datatype verify init time UseSavedMean
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
# Holds the flag and its value in one string; it is expanded unquoted below
# so that it splits into two separate words.
VERIFY="-v $3"
INIT=$4
TIME=$5
USE=$6

# inOutLengths(nhwc) and reduceDims
shape_opts=(-D 64,64,280,82 -R 1,2,3)

######## op datatype UseSavedMean init time inOutLengths(nhwc) reduceDims verify
$DRIVER $OP $DATATYPE $USE $INIT $TIME "${shape_opts[@]}" $VERIFY

14
script/profile_bnorm_fwd.sh Executable file
View File

@@ -0,0 +1,14 @@
#!/bin/bash
# Runs ckProfiler twice for the op named by $1 on a single 4-D shape,
# first with the saveMeanAndInvVariance argument 0 and then with 1.
# Positional arguments: op datatype verify init time updateMovingAverage
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
# Holds the flag and its value in one string; it is expanded unquoted below
# so that it splits into two separate words.
VERIFY="-v $3"
INIT=$4
TIME=$5
USE=$6

######## op datatype updateMovingAverage saveMeanAndInvVariance init time inOutLengths(nhwc) reduceDims verify
for save_stats in 0 1; do
  $DRIVER $OP $DATATYPE $USE "$save_stats" $INIT $TIME -D 64,64,280,82 -R 1,2,3 $VERIFY
done

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Runs ckProfiler once for the op named by $1 with every dimension set to 128.
# Positional arguments: op datatype layout verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# alpha beta
scalars=(1.0 1.0)
# M0 M1 N0 N1 K0 K1
dims=(128 128 128 128 128 128)

######## op datatype compute_datatype num_dim layout verify init log time alpha beta M0 M1 N0 N1 K0 K1
# The datatype is passed twice (datatype and compute_datatype); num_dim is 2.
$DRIVER $OP $DATATYPE $DATATYPE 2 $LAYOUT $VERIFY $INIT $LOG $TIME "${scalars[@]}" "${dims[@]}"

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Runs ckProfiler once for the op named by $1 with every dimension set to 128.
# Positional arguments: op datatype layout verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# alpha
scalars=(1.0)
# M0 M1 N0 N1 K0 K1
dims=(128 128 128 128 128 128)

######## op datatype compute_datatype num_dim layout verify init log time alpha M0 M1 N0 N1 K0 K1
# The datatype is passed twice (datatype and compute_datatype); num_dim is 2.
$DRIVER $OP $DATATYPE $DATATYPE 2 $LAYOUT $VERIFY $INIT $LOG $TIME "${scalars[@]}" "${dims[@]}"

34
script/profile_conv.sh Executable file
View File

@@ -0,0 +1,34 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on twenty 2D convolution shapes.
# Positional arguments: op datatype layout verify init log time N
## GPU visibility
# The export was missing under this comment although every sibling profile
# script sets it; restored so this script pins the same device as the others.
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7
N=$8

# run_case K C Y X Hi Wi Strides(2) Dilations(2) LeftPads(2) RightPads(2)
# Prepends the shared leading arguments plus conv_dim=2, G=1 and N.
# The leading expansions are unquoted, so an argument that was not supplied
# contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N "$@"
}

######## K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
run_case 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
run_case 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
run_case 128 128 3 3 28 28 1 1 1 1 1 1 1 1
run_case 512 128 1 1 28 28 1 1 1 1 0 0 0 0
run_case 128 128 3 3 56 56 2 2 1 1 1 1 1 1
run_case 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
run_case 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
run_case 256 256 3 3 14 14 1 1 1 1 1 1 1 1
run_case 256 256 3 3 28 28 2 2 1 1 1 1 1 1
run_case 128 256 1 1 56 56 1 1 1 1 0 0 0 0
run_case 64 256 1 1 56 56 1 1 1 1 0 0 0 0
run_case 512 512 3 3 14 14 2 2 1 1 1 1 1 1
run_case 128 512 1 1 28 28 1 1 1 1 0 0 0 0
run_case 256 512 1 1 28 28 1 1 1 1 0 0 0 0
run_case 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
run_case 512 512 3 3 7 7 1 1 1 1 1 1 1 1
run_case 256 64 1 1 56 56 1 1 1 1 0 0 0 0
run_case 64 64 1 1 56 56 1 1 1 1 0 0 0 0
run_case 64 64 3 3 56 56 1 1 1 1 1 1 1 1
run_case 64 3 7 7 224 224 2 2 1 1 3 3 3 3

View File

@@ -0,0 +1,19 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on five 2D convolution shapes.
# Positional arguments: op datatype layout verify init log time N
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7
N=$8

# run_case K C Y X Hi Wi Strides(2) Dilations(2) LeftPads(2) RightPads(2)
# The same layout value fills all three layout slots, and N precedes the
# per-case values. The leading expansions are unquoted, so an argument that
# was not supplied contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT $LAYOUT $LAYOUT $VERIFY $INIT $LOG $TIME $N "$@"
}

######## K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
run_case 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
run_case 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
run_case 128 128 3 3 28 28 1 1 1 1 1 1 1 1
run_case 512 128 1 1 28 28 1 1 1 1 0 0 0 0
run_case 128 128 3 3 56 56 2 2 1 1 1 1 1 1

View File

@@ -0,0 +1,21 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on five 2D convolution shapes.
# Positional arguments: op datatype layout verify init log time op_type N
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7
OPTYPE=$8
N=$9

# run_case K C Y X Hi Wi Strides(2) Dilations(2) LeftPads(2) RightPads(2)
# Prepends the shared leading arguments, op_type, Dim=2, G=1 and N.
# The leading expansions are unquoted, so an argument that was not supplied
# contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME $OPTYPE 2 1 $N "$@"
}

######## K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
run_case 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
run_case 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
run_case 128 128 3 3 28 28 1 1 1 1 1 1 1 1
run_case 512 128 1 1 28 28 1 1 1 1 0 0 0 0
run_case 128 128 3 3 56 56 2 2 1 1 1 1 1 1

18
script/profile_gemm_b_scale.sh Executable file
View File

@@ -0,0 +1,18 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on four GEMM problem sizes.
# Positional arguments: op datatype layout verify init log time KBatch
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7
KBatch=$8

# run_case M N K
# Full argument order handed to the profiler:
#   op datatype layout B_block_tile verify init log time
#   M N K StrideA StrideB StrideC KBatch
# B_block_tile is fixed at 1 and the three strides at -1. The unquoted
# expansions contribute no word when the corresponding argument is missing.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT 1 $VERIFY $INIT $LOG $TIME "$1" "$2" "$3" -1 -1 -1 $KBatch
}

run_case 960 1024 1024
run_case 1920 2048 2048
run_case 3840 4096 4096
run_case 7680 8192 8192

19
script/profile_gemm_d0_d1_e.sh Executable file
View File

@@ -0,0 +1,19 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on six GEMM problem sizes.
# Positional arguments: op datatype layout verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# run_case M N K
# Appends -1 for each of StrideA StrideB StrideD0 StrideD1 StrideE.
# The leading expansions are unquoted, so an argument that was not supplied
# contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$1" "$2" "$3" -1 -1 -1 -1 -1
}

######## M___ N___ K___
run_case 1280 1408 1024
run_case 1280 2816 2048
run_case 2560 1408 2048
run_case 2560 2816 2048
run_case 5120 5632 4096
run_case 7040 8192 8192

19
script/profile_gemm_d0_e.sh Executable file
View File

@@ -0,0 +1,19 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on six GEMM problem sizes.
# Positional arguments: op datatype layout verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# run_case M N K
# Appends -1 for each of StrideA StrideB StrideD0 StrideE.
# The leading expansions are unquoted, so an argument that was not supplied
# contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$1" "$2" "$3" -1 -1 -1 -1
}

######## M___ N___ K___
run_case 1280 1408 1024
run_case 1280 2816 2048
run_case 2560 1408 2048
run_case 2560 2816 2048
run_case 5120 5632 4096
run_case 7040 8192 8192

View File

@@ -0,0 +1,36 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on sixteen batched GEMM cases:
# four sizes with strides given as -1, then three groups of square sizes
# with explicit stride values.
# Positional arguments: op datatype layout verify init log time KBatch
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7
KBatch=$8

# run_case M N K StrideA StrideB StrideC BatchCount
# Full argument order handed to the profiler:
#   op datatype layout verify init log time M N K StrideA StrideB StrideC
#   BatchStrideA BatchStrideB BatchStrideC BatchCount KBatch
# The three batch strides are always -1. The unquoted expansions contribute
# no word when the corresponding argument is missing.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$1" "$2" "$3" "$4" "$5" "$6" -1 -1 -1 "$7" $KBatch
}

####### strides passed as -1
run_case 960 1024 1024 -1 -1 -1 8
run_case 1920 2048 2048 -1 -1 -1 8
run_case 3840 4096 4096 -1 -1 -1 4
run_case 7680 8192 8192 -1 -1 -1 2

####### strides equal to the matrix dimension
run_case 1024 1024 1024 1024 1024 1024 8
run_case 2048 2048 2048 2048 2048 2048 8
run_case 4096 4096 4096 4096 4096 4096 4
run_case 8192 8192 8192 8192 8192 8192 2

####### strides equal to the matrix dimension + 32
run_case 1024 1024 1024 1056 1056 1056 8
run_case 2048 2048 2048 2080 2080 2080 8
run_case 4096 4096 4096 4128 4128 4128 4
run_case 8192 8192 8192 8224 8224 8224 2

####### strides equal to the matrix dimension + 64
run_case 1024 1024 1024 1088 1088 1088 8
run_case 2048 2048 2048 2112 2112 2112 8
run_case 4096 4096 4096 4160 4160 4160 4
run_case 8192 8192 8192 8256 8256 8256 2

View File

@@ -0,0 +1,40 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on twelve GEMM cases.
# Positional arguments: op datatype layout verify init log time strategy
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7
STRATEGY=$8

# run_case M N K StrideA StrideB StrideC
# Full argument order handed to the profiler:
#   op datatype layout verify init log time M N K StrideA StrideB StrideC
#   STRATEGY GridSize
# GridSize is always -1. The unquoted expansions contribute no word when the
# corresponding argument is missing.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$@" $STRATEGY -1
}

# 120 CU
run_case 960 1024 1024 -1 -1 -1
run_case 960 2048 2048 -1 -1 -1

# 104 CU
run_case 832 1024 1024 -1 -1 -1
run_case 832 2048 2048 -1 -1 -1

# 110 CU
run_case 1280 1408 1024 -1 -1 -1
run_case 1280 2816 2048 -1 -1 -1

# testing different strides
run_case 1024 1024 1024 1024 1024 1024
run_case 2048 2048 2048 2048 2048 2048
run_case 1024 1024 1024 1056 1056 1056
run_case 2048 2048 2048 2080 2080 2080
run_case 1024 1024 1024 1088 1088 1088
run_case 2048 2048 2048 2112 2112 2112

View File

@@ -15,24 +15,24 @@ TIME=$7
N=$8
# Resnet50
######## op datatype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
######## op datatype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads KSplit
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 1

View File

@@ -0,0 +1,17 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on four grouped GEMM cases; every
# per-group list holds four identical comma-separated entries.
# Positional arguments: op datatype layout verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# Print the given value four times, joined by commas (no trailing newline).
per_group() {
  printf '%s,%s,%s,%s' "$1" "$1" "$1" "$1"
}

# run_case M N K StrideA StrideB StrideC
# Expands each value into a four-entry list and passes the lists in the order
#   Ms Ns Ks StrideAs StrideBs StrideCs
# The leading expansions are unquoted, so an argument that was not supplied
# contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME \
    "$(per_group "$1")" "$(per_group "$2")" "$(per_group "$3")" \
    "$(per_group "$4")" "$(per_group "$5")" "$(per_group "$6")"
}

run_case 960 1024 1024 960 1024 1024
run_case 960 2048 2048 960 2048 2048
run_case 3840 4096 4096 3840 4096 4096
run_case 7680 8192 8192 7680 8192 8192

15
script/profile_groupnorm.sh Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on three 5-value --length shapes.
# Positional arguments: op datatype verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6

# Launch the profiler with the shared leading arguments followed by --length
# and the values passed to this function. The leading expansions are
# unquoted, so an argument that was not supplied contributes no word.
run_case() {
  $DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length "$@"
}

######## length
run_case 1 16 16 32 40
run_case 2 64 64 64 64
run_case 2 64 64 128 128

15
script/profile_layernorm.sh Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on three square 2-value --length
# shapes (256, 1024 and 4096 per side).
# Positional arguments: op datatype verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6

######## op datatype verify init log time length
# The leading expansions are unquoted, so an argument that was not supplied
# contributes no word to the command line.
for side in 256 1024 4096; do
  $DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME --length "$side" "$side"
done

View File

@@ -0,0 +1,14 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on two NCHW input shapes.
# Positional arguments: op datatype verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6

# Options common to both runs, in legend order:
# window size(YX), stride, dilation, left pad, right pad, followed by --dmmy.
# NOTE(review): how the profiler treats the --dmmy values is not visible from
# this script - confirm against the profiler's argument parser.
window_opts=(--wsize 2 2 --wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1 --dmmy 28 29 30 31 32)

# Launch the profiler with the shared leading arguments followed by the
# options passed to this function. The leading expansions are unquoted, so an
# argument that was not supplied contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME "$@"
}

######## length(NCHW)
run_case --length 2 32 30 30 "${window_opts[@]}"
run_case --length 64 32 256 256 "${window_opts[@]}"

View File

@@ -0,0 +1,16 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on two NCHW input shapes, first with
# return_idx 0 and then with return_idx 1 (four runs in total).
# Positional arguments: op datatype verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6

# Options common to every run, in legend order:
# window size(YX), stride, dilation, left pad, right pad.
window_opts=(--wsize 2 2 --wstride 2 2 --wdilation 1 1 --pad1 1 1 --pad2 1 1)

# Launch the profiler with the shared leading arguments followed by the
# values passed to this function. The leading expansions are unquoted, so an
# argument that was not supplied contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME "$@"
}

######## return_idx length(NCHW)
for return_idx in 0 1; do
  run_case "$return_idx" --length 2 32 30 30 "${window_opts[@]}"
  run_case "$return_idx" --length 64 32 256 256 "${window_opts[@]}"
done

14
script/profile_max_pool3d.sh Executable file
View File

@@ -0,0 +1,14 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on two NCDHW input shapes.
# Positional arguments: op datatype verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6

# Options common to both runs (three values each):
# window size, stride, dilation, left pad, right pad.
window_opts=(--wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1)

# Launch the profiler with the shared leading arguments followed by the
# options passed to this function. The leading expansions are unquoted, so an
# argument that was not supplied contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME "$@"
}

######## length(NCDHW)
run_case --length 2 32 30 30 30 "${window_opts[@]}"
run_case --length 8 16 32 256 256 "${window_opts[@]}"

17
script/profile_pool3d_fwd.sh Executable file
View File

@@ -0,0 +1,17 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on two NCDHW input shapes, first
# with return_index 0 and then with return_index 1 (four runs in total).
# Positional arguments: op datatype verify init log time reduce_op
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6
REDUCEOP=$7

# Options common to every run (three values each):
# window size, stride, dilation, left pad, right pad.
window_opts=(--wsize 2 2 2 --wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1)

# run_case return_index <length options...>
# Inserts reduce_op between return_index and the remaining options. The
# unquoted expansions contribute no word when the corresponding argument is
# missing.
run_case() {
  local return_index=$1
  shift
  $DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME "$return_index" $REDUCEOP "$@"
}

######## return_index length(NCDHW)
for return_index_flag in 0 1; do
  run_case "$return_index_flag" --length 2 32 30 30 30 "${window_opts[@]}"
  run_case "$return_index_flag" --length 8 16 32 256 256 "${window_opts[@]}"
done

18
script/profile_softmax.sh Executable file
View File

@@ -0,0 +1,18 @@
#!/bin/bash
# Runs ckProfiler for the op named by $1 on five length/stride/reduce cases.
# Positional arguments: op datatype verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6

# Launch the profiler with the shared leading arguments followed by the
# options passed to this function. The leading expansions are unquoted, so an
# argument that was not supplied contributes no word to the command line.
run_case() {
  $DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME "$@"
}

######## length stride reduce
run_case --length 8 4 256 --stride 1024 256 1 --reduce 2

# NOTE(review): in the 4-value cases the stride lists are not the packed
# row-major strides of the lengths (e.g. the third stride is 131072 where a
# packed layout of 2 8 128 1024 would give 1024) - confirm this is intended.
run_case --length 2 8 128 1024 --stride 2097152 1048576 131072 1 --reduce 2
run_case --length 2 8 128 1024 --stride 2097152 1048576 131072 1 --reduce 3
run_case --length 2 32 512 4096 --stride 134217728 67108864 2097152 1 --reduce 2
run_case --length 2 32 512 4096 --stride 134217728 67108864 2097152 1 --reduce 3

13
script/profile_transpose.sh Executable file
View File

@@ -0,0 +1,13 @@
#!/bin/bash
# Runs ckProfiler once for the op named by $1 on a single 5-value shape.
# Positional arguments: op datatype verify init log time
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6

# N C D H W
shape=(4 8 8 512 512)

######## op datatype verify init log time N C D H W
# The leading expansions are unquoted, so an argument that was not supplied
# contributes no word to the command line.
$DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME "${shape[@]}"

View File

@@ -12,6 +12,8 @@
# environment tag : a string describing the specifics of your test environment
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# node name : $hostname
# extended op = 0 : do not include extended op (default)
# = 1 : include extended op
#get the command line arguments:
export verify=$1
@@ -22,16 +24,19 @@ export branch=$3
echo 'Branch name: ' $branch
export host_name=$4
echo 'Host name: ' $host_name
export inlcude_extended_op=$5
echo 'Inlcude extended op: ' $inlcude_extended_op
#######################################
# Start a fresh log file and write the run header into it.
# The header sequence used to appear twice in the body; the second pass
# deleted the file and rewrote it, so only one pass is kept here. The
# resulting file is the same and rocminfo/hipcc are invoked half as often.
# Arguments:
#   $1 - path of the log file (any existing file is removed first)
#   $2 - environment tag, written after 'Environment type: '
#   $3 - branch name, written after 'On branch '
#   $4 - node name, written after 'Node name: '
# Outputs:
#   Writes the header lines to $1; diagnostics of the queried tools go to
#   stderr.
# Returns:
#   Exit status of the final amdclang++ | grep pipeline.
#######################################
print_log_header() {
    local log_file=$1
    local env_tag=$2
    local branch_name=$3
    local node_name=$4

    rm -f -- "$log_file"
    {
        echo 'On branch ' "$branch_name"
        echo 'Node name: ' "$node_name"
        #get GPU_arch and number of compute units from rocminfo
        echo -n "GPU_arch: "
        rocminfo | grep "Name:" | grep "gfx"
        rocminfo | grep "Compute Unit:"
        hipcc --version | grep -e 'HIP version'
        echo 'Environment type: ' "$env_tag"
        /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir'
    } > "$log_file"
}
#run gemm tests
@@ -103,6 +108,9 @@ print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 2 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 0 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 2 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
#run grouped_bwd_weight tests
export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight.log"
@@ -148,4 +156,472 @@ print_log_header $onnx_log $env_type $branch $host_name
export mixed_gemm_log="perf_mixed_gemm.log"
print_log_header $mixed_gemm_log $env_type $branch $host_name
./profile_mixed_gemm.sh gemm_splitk 4 0 $verify 2 0 1 16 2>&1 | tee -a $mixed_gemm_log
./profile_mixed_gemm.sh gemm_splitk 5 0 $verify 2 0 1 16 2>&1 | tee -a $mixed_gemm_log
./profile_mixed_gemm.sh gemm_splitk 5 0 $verify 2 0 1 16 2>&1 | tee -a $mixed_gemm_log
if [ "$inlcude_extended_op" = "1" ]; then
#run batched_gemm_add_relu_gemm_add tests
export batched_gemm_add_relu_gemm_add_log="perf_batched_gemm_add_relu_gemm_add.log"
print_log_header $batched_gemm_add_relu_gemm_add_log $env_type $branch $host_name
./profile_batched_gemm_gemm.sh batched_gemm_add_relu_gemm_add 1 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_add_relu_gemm_add_log
./profile_batched_gemm_gemm.sh batched_gemm_add_relu_gemm_add 1 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_add_relu_gemm_add_log
#run batched_gemm_b_scale tests
export batched_gemm_b_scale_log="perf_batched_gemm_b_scale.log"
print_log_header $batched_gemm_b_scale_log $env_type $branch $host_name
./profile_batched_gemm_b_scale.sh batched_gemm_b_scale 8 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_b_scale_log
#run batched_gemm_gemm tests
export batched_gemm_gemm_log="perf_batched_gemm_gemm.log"
print_log_header $batched_gemm_gemm_log $env_type $branch $host_name
./profile_batched_gemm_gemm.sh batched_gemm_gemm 1 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_gemm_log
./profile_batched_gemm_gemm.sh batched_gemm_gemm 1 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_gemm_log
#run batched_gemm_multi_d tests
export batched_gemm_multi_d_log="perf_batched_gemm_multi_d.log"
print_log_header $batched_gemm_multi_d_log $env_type $branch $host_name
./profile_batched_gemm.sh batched_gemm_multi_d 0 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log
./profile_batched_gemm.sh batched_gemm_multi_d 0 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log
./profile_batched_gemm.sh batched_gemm_multi_d 0 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log
./profile_batched_gemm.sh batched_gemm_multi_d 0 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log
./profile_batched_gemm.sh batched_gemm_multi_d 1 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log
./profile_batched_gemm.sh batched_gemm_multi_d 1 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log
./profile_batched_gemm.sh batched_gemm_multi_d 1 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log
./profile_batched_gemm.sh batched_gemm_multi_d 1 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_multi_d_log
#run batched_gemm_reduce tests
# Sweeps layout {0..3} with dtype fixed at 1 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export batched_gemm_reduce_log="perf_batched_gemm_reduce.log"
print_log_header "$batched_gemm_reduce_log" "$env_type" "$branch" "$host_name"
for layout in 0 1 2 3; do
  ./profile_batched_gemm_reduce.sh batched_gemm_reduce 1 "$layout" "$verify" 1 0 1 2>&1 |
    tee -a "$batched_gemm_reduce_log"
done
#run contraction_bilinear tests
# Sweeps dtype {0,1} with layout fixed at 0 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export contraction_bilinear_log="perf_contraction_bilinear.log"
print_log_header "$contraction_bilinear_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1; do
  ./profile_contraction_bilinear.sh contraction_bilinear "$dtype" 0 "$verify" 1 0 1 2>&1 |
    tee -a "$contraction_bilinear_log"
done
#run contraction_scale tests
# Sweeps dtype {0,1} with layout fixed at 0 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export contraction_scale_log="perf_contraction_scale.log"
print_log_header "$contraction_scale_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1; do
  ./profile_contraction_scale.sh contraction_scale "$dtype" 0 "$verify" 1 0 1 2>&1 |
    tee -a "$contraction_scale_log"
done
#run conv_bwd_data tests
# Sweeps dtype {0..3} with layout fixed at 1 and a trailing 256 (names assumed from the
# helper's argument order — TODO confirm); each run's stdout+stderr is appended to the log.
export conv_bwd_data_log="perf_conv_bwd_data.log"
print_log_header "$conv_bwd_data_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1 2 3; do
  ./profile_conv.sh conv_bwd_data "$dtype" 1 "$verify" 1 0 1 256 2>&1 |
    tee -a "$conv_bwd_data_log"
done
#run conv_fwd_bias_relu_add tests
# Single run with leading selectors 1 1 and a trailing 256; stdout+stderr is appended
# to the log via tee -a.
export conv_fwd_bias_relu_add_log="perf_conv_fwd_bias_relu_add.log"
print_log_header "$conv_fwd_bias_relu_add_log" "$env_type" "$branch" "$host_name"
./profile_conv_fwd_bias_relu_add.sh conv_fwd_bias_relu_add 1 1 "$verify" 1 0 1 256 2>&1 |
  tee -a "$conv_fwd_bias_relu_add_log"
#run conv_fwd tests
# Sweeps layout {0,1} x dtype {0..3}, dtype varying fastest (names assumed from the
# helper's argument order — TODO confirm); each run's stdout+stderr is appended to the log.
export conv_fwd_log="perf_conv_fwd.log"
print_log_header "$conv_fwd_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  for dtype in 0 1 2 3; do
    ./profile_conv.sh conv_fwd "$dtype" "$layout" "$verify" 1 0 1 256 2>&1 |
      tee -a "$conv_fwd_log"
  done
done
#run conv_tensor_rearrange tests
# Only a subset of (dtype, layout, mode) combinations is exercised, so the sweep is
# written as three loops that together issue exactly these eight runs in order:
#   mode 0, layout 0: dtype 0 1 2 3
#   mode 0, layout 1: dtype 1 3
#   mode 1, dtype 1 : layout 0 1
# dtype/layout/mode are assumed names for the 1st, 2nd and second-to-last arguments —
# TODO confirm in the helper. Each run's stdout+stderr is appended to the log via tee -a.
export conv_tensor_rearrange_log="perf_conv_tensor_rearrange.log"
print_log_header "$conv_tensor_rearrange_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1 2 3; do
  ./profile_conv_tensor_rearrange.sh conv_tensor_rearrange "$dtype" 0 "$verify" 1 0 1 0 256 2>&1 |
    tee -a "$conv_tensor_rearrange_log"
done
for dtype in 1 3; do
  ./profile_conv_tensor_rearrange.sh conv_tensor_rearrange "$dtype" 1 "$verify" 1 0 1 0 256 2>&1 |
    tee -a "$conv_tensor_rearrange_log"
done
for layout in 0 1; do
  ./profile_conv_tensor_rearrange.sh conv_tensor_rearrange 1 "$layout" "$verify" 1 0 1 1 256 2>&1 |
    tee -a "$conv_tensor_rearrange_log"
done
#run gemm_ab_scale tests
# Single run with leading selectors 7 1; stdout+stderr is appended to the log via tee -a.
# NOTE(review): the gemm_b_scale section calls this same helper (profile_gemm_b_scale.sh)
# with one more trailing argument — confirm the shorter argument list is intended here.
export gemm_ab_scale_log="perf_gemm_ab_scale.log"
print_log_header "$gemm_ab_scale_log" "$env_type" "$branch" "$host_name"
./profile_gemm_b_scale.sh gemm_ab_scale 7 1 "$verify" 1 0 1 2>&1 |
  tee -a "$gemm_ab_scale_log"
#run gemm_add_add_fastgelu tests
# Sweeps layout {0,1} with dtype fixed at 1 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export gemm_add_add_fastgelu_log="perf_gemm_add_add_fastgelu.log"
print_log_header "$gemm_add_add_fastgelu_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  ./profile_gemm_d0_d1_e.sh gemm_add_add_fastgelu 1 "$layout" "$verify" 1 0 1 2>&1 |
    tee -a "$gemm_add_add_fastgelu_log"
done
#run gemm_add_fastgelu tests
# Sweeps layout {0,1} x dtype {1,2}, dtype varying fastest (names assumed from the
# helper's argument order — TODO confirm); each run's stdout+stderr is appended to the log.
export gemm_add_fastgelu_log="perf_gemm_add_fastgelu.log"
print_log_header "$gemm_add_fastgelu_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  for dtype in 1 2; do
    ./profile_gemm_d0_e.sh gemm_add_fastgelu "$dtype" "$layout" "$verify" 1 0 1 2>&1 |
      tee -a "$gemm_add_fastgelu_log"
  done
done
#run gemm_add_multiply tests
# Sweeps layout {0,1} with dtype fixed at 1 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export gemm_add_multiply_log="perf_gemm_add_multiply.log"
print_log_header "$gemm_add_multiply_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  ./profile_gemm_d0_d1_e.sh gemm_add_multiply 1 "$layout" "$verify" 1 0 1 2>&1 |
    tee -a "$gemm_add_multiply_log"
done
#run gemm_add_relu_add_layernorm tests
# Sweeps layout {0,1} with dtype fixed at 1 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export gemm_add_relu_add_layernorm_log="perf_gemm_add_relu_add_layernorm.log"
print_log_header "$gemm_add_relu_add_layernorm_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  ./profile_gemm_d0_d1_e.sh gemm_add_relu_add_layernorm 1 "$layout" "$verify" 1 0 1 2>&1 |
    tee -a "$gemm_add_relu_add_layernorm_log"
done
#run gemm_add_relu tests
# Sweeps dtype {0,1} with layout fixed at 0 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export gemm_add_relu_log="perf_gemm_add_relu.log"
print_log_header "$gemm_add_relu_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1; do
  ./profile_gemm_d0_e.sh gemm_add_relu "$dtype" 0 "$verify" 1 0 1 2>&1 |
    tee -a "$gemm_add_relu_log"
done
#run gemm_add_silu tests
# Sweeps layout {0,1} x dtype {0,1}, dtype varying fastest (names assumed from the
# helper's argument order — TODO confirm); each run's stdout+stderr is appended to the log.
export gemm_add_silu_log="perf_gemm_add_silu.log"
print_log_header "$gemm_add_silu_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  for dtype in 0 1; do
    ./profile_gemm_d0_e.sh gemm_add_silu "$dtype" "$layout" "$verify" 1 0 1 2>&1 |
      tee -a "$gemm_add_silu_log"
  done
done
#run gemm_add tests
# Sweeps layout {0,1} with dtype fixed at 1 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export gemm_add_log="perf_gemm_add.log"
print_log_header "$gemm_add_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  ./profile_gemm_d0_e.sh gemm_add 1 "$layout" "$verify" 1 0 1 2>&1 |
    tee -a "$gemm_add_log"
done
#run gemm_b_scale tests
# Single run with leading selectors 8 1 and one extra trailing argument (1);
# stdout+stderr is appended to the log via tee -a.
export gemm_b_scale_log="perf_gemm_b_scale.log"
print_log_header "$gemm_b_scale_log" "$env_type" "$branch" "$host_name"
./profile_gemm_b_scale.sh gemm_b_scale 8 1 "$verify" 1 0 1 1 2>&1 |
  tee -a "$gemm_b_scale_log"
#run gemm_bias_add_reduce tests
# Sweeps layout {0,1} with dtype fixed at 1 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export gemm_bias_add_reduce_log="perf_gemm_bias_add_reduce.log"
print_log_header "$gemm_bias_add_reduce_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  ./profile_gemm_d0_e.sh gemm_bias_add_reduce 1 "$layout" "$verify" 1 0 1 2>&1 |
    tee -a "$gemm_bias_add_reduce_log"
done
#run gemm_fastgelu tests
# Sweeps layout {0,1} with dtype fixed at 1 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export gemm_fastgelu_log="perf_gemm_fastgelu.log"
print_log_header "$gemm_fastgelu_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  ./profile_gemm.sh gemm_fastgelu 1 "$layout" "$verify" 1 0 1 2>&1 |
    tee -a "$gemm_fastgelu_log"
done
#run gemm_multiply_add tests
# Sweeps layout {0,1} x dtype {0,1}, dtype varying fastest (names assumed from the
# helper's argument order — TODO confirm); each run's stdout+stderr is appended to the log.
export gemm_multiply_add_log="perf_gemm_multiply_add.log"
print_log_header "$gemm_multiply_add_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  for dtype in 0 1; do
    ./profile_gemm_d0_d1_e.sh gemm_multiply_add "$dtype" "$layout" "$verify" 1 0 1 2>&1 |
      tee -a "$gemm_multiply_add_log"
  done
done
#run gemm_multiply_multiply_weight_preshuffle tests
# Sweeps dtype {0,1} with layout fixed at 0 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export gemm_multiply_multiply_weight_preshuffle_log="perf_gemm_multiply_multiply_weight_preshuffle.log"
print_log_header "$gemm_multiply_multiply_weight_preshuffle_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1; do
  ./profile_gemm_d0_d1_e.sh gemm_multiply_multiply_weight_preshuffle "$dtype" 0 "$verify" 1 0 1 2>&1 |
    tee -a "$gemm_multiply_multiply_weight_preshuffle_log"
done
#run gemm_multiply_multiply tests
# Sweeps dtype {7,9,10} with layout fixed at 1 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export gemm_multiply_multiply_log="perf_gemm_multiply_multiply.log"
print_log_header "$gemm_multiply_multiply_log" "$env_type" "$branch" "$host_name"
for dtype in 7 9 10; do
  ./profile_gemm_d0_d1_e.sh gemm_multiply_multiply "$dtype" 1 "$verify" 1 0 1 2>&1 |
    tee -a "$gemm_multiply_multiply_log"
done
#run gemm_reduce tests
# Sweeps layout {0,1} with dtype fixed at 1 and one extra trailing argument (1). Names
# are assumed from the helper's argument order — TODO confirm. Each run's stdout+stderr
# is appended to the log via tee -a.
export gemm_reduce_log="perf_gemm_reduce.log"
print_log_header "$gemm_reduce_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  ./profile_splitK_gemm.sh gemm_reduce 1 "$layout" "$verify" 1 0 1 1 2>&1 |
    tee -a "$gemm_reduce_log"
done
#run gemm_streamk tests
# Sweeps layout {0,1} x dtype {0..3}, dtype varying fastest (names assumed from the
# helper's argument order — TODO confirm); each run's stdout+stderr is appended to the log.
export gemm_streamk_log="perf_gemm_streamk.log"
print_log_header "$gemm_streamk_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  for dtype in 0 1 2 3; do
    ./profile_gemm.sh gemm_streamk "$dtype" "$layout" "$verify" 1 0 1 2>&1 |
      tee -a "$gemm_streamk_log"
  done
done
#run gemm_universal_batched tests
# Sweeps layout {0,1} x dtype {0,1}, dtype varying fastest, with one extra trailing
# argument (1). Names are assumed from the helper's argument order — TODO confirm.
# Each run's stdout+stderr is appended to the log via tee -a.
export gemm_universal_batched_log="perf_gemm_universal_batched.log"
print_log_header "$gemm_universal_batched_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  for dtype in 0 1; do
    ./profile_gemm_universal_batched.sh gemm_universal_batched "$dtype" "$layout" "$verify" 1 0 1 1 2>&1 |
      tee -a "$gemm_universal_batched_log"
  done
done
#run gemm_universal_reduce tests
# Sweeps layout {0,1} x dtype {0..6}, dtype varying fastest, with one extra trailing
# argument (1). Names are assumed from the helper's argument order — TODO confirm.
# Each run's stdout+stderr is appended to the log via tee -a.
export gemm_universal_reduce_log="perf_gemm_universal_reduce.log"
print_log_header "$gemm_universal_reduce_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  for dtype in 0 1 2 3 4 5 6; do
    ./profile_splitK_gemm.sh gemm_universal_reduce "$dtype" "$layout" "$verify" 1 0 1 1 2>&1 |
      tee -a "$gemm_universal_reduce_log"
  done
done
#run gemm_universal_streamk tests
# Sweeps the trailing argument {0,1,2} x layout {0,1} x dtype {0..6}, dtype varying
# fastest and the trailing argument slowest (42 runs). dtype/layout are assumed names
# from the helper's argument order; the meaning of the trailing "mode" value is not
# visible here — TODO confirm in the helper. Each run's stdout+stderr is appended to
# the log via tee -a.
export gemm_universal_streamk_log="perf_gemm_universal_streamk.log"
print_log_header "$gemm_universal_streamk_log" "$env_type" "$branch" "$host_name"
for mode in 0 1 2; do
  for layout in 0 1; do
    for dtype in 0 1 2 3 4 5 6; do
      ./profile_gemm_universal_streamk.sh gemm_universal_streamk "$dtype" "$layout" "$verify" 1 0 1 "$mode" 2>&1 |
        tee -a "$gemm_universal_streamk_log"
    done
  done
done
#run gemm_universal tests
# Sweeps layout {0,1} x dtype {0..9}, dtype varying fastest, with one extra trailing
# argument (1). Names are assumed from the helper's argument order — TODO confirm.
# Each run's stdout+stderr is appended to the log via tee -a.
export gemm_universal_log="perf_gemm_universal.log"
print_log_header "$gemm_universal_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  for dtype in 0 1 2 3 4 5 6 7 8 9; do
    ./profile_splitK_gemm.sh gemm_universal "$dtype" "$layout" "$verify" 1 0 1 1 2>&1 |
      tee -a "$gemm_universal_log"
  done
done
#run grouped_conv_fwd_outelementop tests
# Sweeps the 2nd selector {0,1} x the 1st selector {0..3}, 1st varying fastest; the
# 3rd selector is fixed at 1 and 256 is passed last. dtype/layout are assumed names for
# the first two selectors — TODO confirm in the helper. Each run's stdout+stderr is
# appended to the log via tee -a.
export grouped_conv_fwd_outelementop_log="perf_grouped_conv_fwd_outelementop.log"
print_log_header "$grouped_conv_fwd_outelementop_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  for dtype in 0 1 2 3; do
    ./profile_grouped_conv_fwd_outelementop.sh grouped_conv_fwd_outelementop "$dtype" "$layout" 1 "$verify" 1 0 1 256 2>&1 |
      tee -a "$grouped_conv_fwd_outelementop_log"
  done
done
#run grouped_gemm_fastgelu tests
# Sweeps layout {0,1} with dtype fixed at 1 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export grouped_gemm_fastgelu_log="perf_grouped_gemm_fastgelu.log"
print_log_header "$grouped_gemm_fastgelu_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  ./profile_grouped_gemm.sh grouped_gemm_fastgelu 1 "$layout" "$verify" 1 0 1 2>&1 |
    tee -a "$grouped_gemm_fastgelu_log"
done
#run grouped_gemm_fixed_nk tests
# Sweeps layout {0,1} x dtype {0..3}, dtype varying fastest (names assumed from the
# helper's argument order — TODO confirm); each run's stdout+stderr is appended to the log.
export grouped_gemm_fixed_nk_log="perf_grouped_gemm_fixed_nk.log"
print_log_header "$grouped_gemm_fixed_nk_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  for dtype in 0 1 2 3; do
    ./profile_grouped_gemm_fixed_nk.sh grouped_gemm_fixed_nk "$dtype" "$layout" "$verify" 1 0 1 2>&1 |
      tee -a "$grouped_gemm_fixed_nk_log"
  done
done
#run grouped_gemm_multiply_tile_loop tests
# Single run with leading selectors 0 0; stdout+stderr is appended to the log via tee -a.
export grouped_gemm_multiply_tile_loop_log="perf_grouped_gemm_multiply_tile_loop.log"
print_log_header "$grouped_gemm_multiply_tile_loop_log" "$env_type" "$branch" "$host_name"
./profile_grouped_gemm.sh grouped_gemm_multiply_tile_loop 0 0 "$verify" 1 0 1 2>&1 |
  tee -a "$grouped_gemm_multiply_tile_loop_log"
#run grouped_gemm_tile_loop tests
# Sweeps layout {0,1} with dtype fixed at 0 (names assumed from the helper's argument
# order — TODO confirm); each run's stdout+stderr is appended to the log via tee -a.
export grouped_gemm_tile_loop_log="perf_grouped_gemm_tile_loop.log"
print_log_header "$grouped_gemm_tile_loop_log" "$env_type" "$branch" "$host_name"
for layout in 0 1; do
  ./profile_grouped_gemm.sh grouped_gemm_tile_loop 0 "$layout" "$verify" 1 0 1 2>&1 |
    tee -a "$grouped_gemm_tile_loop_log"
done
#run groupnorm tests
# Sweeps dtype {0,1} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export groupnorm_log="perf_groupnorm.log"
print_log_header "$groupnorm_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1; do
  ./profile_groupnorm.sh groupnorm "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$groupnorm_log"
done
#run permute_scale tests
# Sweeps dtype {0,1} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export permute_scale_log="perf_permute_scale.log"
print_log_header "$permute_scale_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1; do
  ./profile_permute_scale.sh permute_scale "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$permute_scale_log"
done
#run transpose tests
# Sweeps dtype {0,1} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export transpose_log="perf_transpose.log"
print_log_header "$transpose_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1; do
  ./profile_transpose.sh transpose "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$transpose_log"
done
#run avg_pool2d_bwd tests
# Sweeps dtype {0,1,3,5,7} (name assumed from the helper's argument order — TODO
# confirm); each run's stdout+stderr is appended to the log via tee -a.
export avg_pool2d_bwd_log="perf_avg_pool2d_bwd.log"
print_log_header "$avg_pool2d_bwd_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1 3 5 7; do
  ./profile_avg_pool2d_bwd.sh avg_pool2d_bwd "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$avg_pool2d_bwd_log"
done
#run avg_pool3d_bwd tests
# Sweeps dtype {0,1,5} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export avg_pool3d_bwd_log="perf_avg_pool3d_bwd.log"
print_log_header "$avg_pool3d_bwd_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1 5; do
  ./profile_avg_pool3d_bwd.sh avg_pool3d_bwd "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$avg_pool3d_bwd_log"
done
#run bnorm_bwd tests
# Sweeps dtype {0,1,5,6} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
# NOTE(review): the flags after $verify are "0 1 0" here, whereas the non-bnorm sections
# of this script pass "1 0 1" — confirm this is what profile_bnorm.sh expects.
export bnorm_bwd_log="perf_bnorm_bwd.log"
print_log_header "$bnorm_bwd_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1 5 6; do
  ./profile_bnorm.sh bnorm_bwd "$dtype" "$verify" 0 1 0 2>&1 |
    tee -a "$bnorm_bwd_log"
done
#run bnorm_fwd tests
# Sweeps dtype {0,1,5,6} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
# NOTE(review): the flags after $verify are "0 1 0" here, whereas the non-bnorm sections
# of this script pass "1 0 1" — confirm this is what profile_bnorm_fwd.sh expects.
export bnorm_fwd_log="perf_bnorm_fwd.log"
print_log_header "$bnorm_fwd_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1 5 6; do
  ./profile_bnorm_fwd.sh bnorm_fwd "$dtype" "$verify" 0 1 0 2>&1 |
    tee -a "$bnorm_fwd_log"
done
#run bnorm_infer tests
# Sweeps dtype {0,1,5,6} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
# NOTE(review): the flags after $verify are "0 1 0" here, whereas the non-bnorm sections
# of this script pass "1 0 1" — confirm this is what profile_bnorm.sh expects.
export bnorm_infer_log="perf_bnorm_infer.log"
print_log_header "$bnorm_infer_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1 5 6; do
  ./profile_bnorm.sh bnorm_infer "$dtype" "$verify" 0 1 0 2>&1 |
    tee -a "$bnorm_infer_log"
done
#run groupnorm_bwd_data tests
# Single run with leading selector 1; stdout+stderr is appended to the log via tee -a.
export groupnorm_bwd_data_log="perf_groupnorm_bwd_data.log"
print_log_header "$groupnorm_bwd_data_log" "$env_type" "$branch" "$host_name"
./profile_groupnorm.sh groupnorm_bwd_data 1 "$verify" 1 0 1 2>&1 |
  tee -a "$groupnorm_bwd_data_log"
#run groupnorm_bwd_gamma_beta tests
# Sweeps dtype {0,1} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export groupnorm_bwd_gamma_beta_log="perf_groupnorm_bwd_gamma_beta.log"
print_log_header "$groupnorm_bwd_gamma_beta_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1; do
  ./profile_groupnorm.sh groupnorm_bwd_gamma_beta "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$groupnorm_bwd_gamma_beta_log"
done
#run layernorm_bwd_data tests
# Sweeps dtype {0,1} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export layernorm_bwd_data_log="perf_layernorm_bwd_data.log"
print_log_header "$layernorm_bwd_data_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1; do
  ./profile_layernorm.sh layernorm_bwd_data "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$layernorm_bwd_data_log"
done
#run layernorm_bwd_gamma_beta tests
# Sweeps dtype {0,1} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export layernorm_bwd_gamma_beta_log="perf_layernorm_bwd_gamma_beta.log"
print_log_header "$layernorm_bwd_gamma_beta_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1; do
  ./profile_layernorm.sh layernorm_bwd_gamma_beta "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$layernorm_bwd_gamma_beta_log"
done
#run layernorm_fwd tests
# Sweeps dtype {0,1} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export layernorm_fwd_log="perf_layernorm_fwd.log"
print_log_header "$layernorm_fwd_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1; do
  ./profile_layernorm.sh layernorm_fwd "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$layernorm_fwd_log"
done
#run max_pool2d_bwd tests
# Sweeps dtype {0,1,3,5} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export max_pool2d_bwd_log="perf_max_pool2d_bwd.log"
print_log_header "$max_pool2d_bwd_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1 3 5; do
  ./profile_max_pool2d_bwd.sh max_pool2d_bwd "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$max_pool2d_bwd_log"
done
#run max_pool2d_fwd tests
# Sweeps dtype {0..4} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export max_pool2d_fwd_log="perf_max_pool2d_fwd.log"
print_log_header "$max_pool2d_fwd_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1 2 3 4; do
  ./profile_max_pool2d_fwd.sh max_pool2d_fwd "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$max_pool2d_fwd_log"
done
#run max_pool3d_bwd tests
# Sweeps dtype {0,1,5} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export max_pool3d_bwd_log="perf_max_pool3d_bwd.log"
print_log_header "$max_pool3d_bwd_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1 5; do
  ./profile_max_pool3d.sh max_pool3d_bwd "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$max_pool3d_bwd_log"
done
#run pool3d_fwd tests
# Sweeps the trailing argument {0,1} x dtype {0,1,3,5,7}, dtype varying fastest.
# dtype is an assumed name from the helper's argument order; the meaning of the
# trailing "mode" value is not visible here — TODO confirm in the helper.
# Each run's stdout+stderr is appended to the log via tee -a.
export pool3d_fwd_log="perf_pool3d_fwd.log"
print_log_header "$pool3d_fwd_log" "$env_type" "$branch" "$host_name"
for mode in 0 1; do
  for dtype in 0 1 3 5 7; do
    ./profile_pool3d_fwd.sh pool3d_fwd "$dtype" "$verify" 1 0 1 "$mode" 2>&1 |
      tee -a "$pool3d_fwd_log"
  done
done
#run softmax tests
# Sweeps dtype {0..3} (name assumed from the helper's argument order — TODO confirm);
# each run's stdout+stderr is appended to the log via tee -a.
export softmax_log="perf_softmax.log"
print_log_header "$softmax_log" "$env_type" "$branch" "$host_name"
for dtype in 0 1 2 3; do
  ./profile_softmax.sh softmax "$dtype" "$verify" 1 0 1 2>&1 |
    tee -a "$softmax_log"
done
fi