mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 18:17:44 +00:00
* start fixing 16bit data packing
* adding StaticTensor
* adding StaticTensor
* adding StaticTensor
* add missing constexpr
* adding static tensor
* adding static tensor
* adding transpose
* add inline asm for transpose 2x2 of half_t
* add general transpose_vectors(), but have unnecessary register initialization using v_mov
* fix unnecessary register initialization in transpose_vector by using more pass-by-reference
* add hardcoded logic for NHWC wrw
* improve asm for v_pack
* make ThreadwiseTensorSliceTransfer_v3r2 support any tensor
* tweak
* reorganize file
[ROCm/composable_kernel commit: b491ebf384]
25 lines
873 B
Bash
Executable File
25 lines
873 B
Bash
Executable File
#!/bin/bash
#
# Build ckProfiler and run a single profiling case with it.
#
# Usage: <this script> OP DATATYPE LAYOUT VERIFY INIT LOG REPEAT
#
# The seven arguments are forwarded, in order, to ckProfiler and are followed
# by the problem sizes (M N K StrideA StrideB StrideC) selected at the bottom
# of this file. Arguments beyond the seventh are ignored.
#
# The exit status is that of ckProfiler, that of make if the build fails, or 2
# if too few arguments are given.

if (( $# < 7 )); then
  # With fewer arguments the hard-coded sizes would slide into the wrong
  # positional slots of the profiler command line, so refuse to run.
  printf 'Usage: %s OP DATATYPE LAYOUT VERIFY INIT LOG REPEAT\n' "${0##*/}" >&2
  exit 2
fi

## GPU visibility
export HIP_VISIBLE_DEVICES=0

# Abort with make's status if the build fails, so a stale or missing binary
# is never profiled.
make -j ckProfiler || exit

DRIVER="./profiler/ckProfiler"

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
REPEAT=$7

# Enable exactly the problem sizes that should be profiled.
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
#"$DRIVER" "$OP" "$DATATYPE" "$LAYOUT" "$VERIFY" "$INIT" "$LOG" "$REPEAT" 256 256 256 256 256 256
#"$DRIVER" "$OP" "$DATATYPE" "$LAYOUT" "$VERIFY" "$INIT" "$LOG" "$REPEAT" 960 1024 1024 1024 1024 1024
#"$DRIVER" "$OP" "$DATATYPE" "$LAYOUT" "$VERIFY" "$INIT" "$LOG" "$REPEAT" 1024 1024 1024 1024 1024 1024
#"$DRIVER" "$OP" "$DATATYPE" "$LAYOUT" "$VERIFY" "$INIT" "$LOG" "$REPEAT" 1920 2048 2048 2048 2048 2048
"$DRIVER" "$OP" "$DATATYPE" "$LAYOUT" "$VERIFY" "$INIT" "$LOG" "$REPEAT" 3840 4096 4096 4096 4096 4096
#"$DRIVER" "$OP" "$DATATYPE" "$LAYOUT" "$VERIFY" "$INIT" "$LOG" "$REPEAT" 7680 8192 8192 8192 8192 8192