mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 02:54:21 +00:00
* Use thread cluster descriptor and explicit M_K 2d descriptor to simply Blockwise Reduction
* Change by replacing ReduceDims by NumReduceDims as Device Reduce interface template parameter
* Rename the folder name for the pool2d and reduce examples
* Update to reduction test scripts
* Add Readme for pool2d_fwd and reduce_blockwise examples
* Add support for int8_t reduction (ADD/AVG, MIN/MAX/AMAX)
* Tiny fix in reduce profiler and tiny update in reduce testing scripts
* Tiny fix in testing script profile_reduce_no_index.sh
* Tiny fix in testing script profile_reduce_no_index.sh
* Add support for bfp16 reduction (using bhalf_t = ushort)
* Tiny fix in amd_buffer_addressing.hpp
* Tiny change in script/profile_reduce_with_index.sh
* Use AccDataType for Beta value and use element_wise::PassThrough
* Use type_convert for type converting in host layer reduction
* Renaming and refining in Reduction profiler/device layer/examples
* Renaming and refining in Reduction profiler/device layer/examples
* Renaming all NumReduceDims to NumReduceDim
* Fix the leaked type_convert in ThreadwiseTensorSliceTransfer_v2
* Update to testing scripts to add bf16 support
* added more static_assert
* Remove buggy tunable configurations defined in device_reduce_instance_xxx.hpp
* Add static_assert to give compile-time warning for incorrect thread slice-size/vector-size configurations
* minor change
* Refine and fix (in GetWorkspaceSizeInBytes of MultiBlockPartialReduce) to make int8 completely pass
* Tiny renaming in gridwise_2d_reduction_multiblock_partial_reduce.hpp
* Tiny fix in script/profile_reduce_no_index.sh
* Refine in DeviceReduce layer with regard to using NumInvariantDim/NumReduceDim or InvariantDims/ReduceDims
* Generic renaming in host reduction and DeviceReduce layer
* Add support for 4-d all dimension reduction in the profiler and add_device_reduce_xxx instances
* Use multi-thread and simplification for host Reduction implementation
* Add ctest for reduction
* Update to clarify the using of data init method in produce_reduce/example_reduce/test_reduce/
* Update to the reduce CTest executables to enable default testing behavior when no command argument
* Renaming
Co-authored-by: Jianfeng yan <jfyan008@gmail.com>
[ROCm/composable_kernel commit: 9a8ee8a39a]
74 lines
4.5 KiB
Bash
Executable File
74 lines
4.5 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
PRECISION=
|
|
##PRECISION=--half
|
|
##PRECISION=--double
|
|
##PRECISION=--int8
|
|
##PRECISION=--bf16
|
|
|
|
driver="./bin/ckProfiler"
|
|
|
|
VERIFY="-v $1"
|
|
INIT=$2
|
|
NREPEAT=$3
|
|
|
|
#### 2 - MIN, 3 - MAX, 4 - AMAX
|
|
Operations="2 4"
|
|
|
|
## for generic validation
|
|
for op in $Operations; do
|
|
for use_idx in 0 1; do
|
|
set -x
|
|
####### datatype layout reduce dims op use index verify init repeats
|
|
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 64,4,280,82 -R 3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 64,4,280,82 -R 1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 64,4,280,82 -R 0,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,22960 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,22960 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 4,1469440 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 4,1469440 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
set +x
|
|
done
|
|
done
|
|
|
|
Operations=2
|
|
|
|
## for performance evaluation (resnet50 NHWC => C)
|
|
for op in $Operations; do
|
|
for use_idx in 0 1; do
|
|
set -x
|
|
####### datatype layout reduce dims op use index verify init repeats
|
|
$driver reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
$driver reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
|
set +x
|
|
done
|
|
done
|
|
|