mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-08 15:30:23 +00:00
Reduction for int8 and bfloat16 (#125)
* Use thread cluster descriptor and explicit M_K 2d descriptor to simply Blockwise Reduction * Change by replacing ReduceDims by NumReduceDims as Device Reduce interface template parameter * Rename the folder name for the pool2d and reduce examples * Update to reduction test scripts * Add Readme for pool2d_fwd and reduce_blockwise examples * Add support for int8_t reduction (ADD/AVG, MIN/MAX/AMAX) * Tiny fix in reduce profiler and tiny update in reduce testing scripts * Tiny fix in testing script profile_reduce_no_index.sh * Tiny fix in testing script profile_reduce_no_index.sh * Add support for bfp16 reduction (using bhalf_t = ushort) * Tiny fix in amd_buffer_addressing.hpp * Tiny change in script/profile_reduce_with_index.sh * Use AccDataType for Beta value and use element_wise::PassThrough * Use type_convert for type converting in host layer reduction * Renaming and refining in Reduction profiler/device layer/examples * Renaming and refining in Reduction profiler/device layer/examples * Renaming all NumReduceDims to NumReduceDim * Fix the leaked type_convert in ThreadwiseTensorSliceTransfer_v2 * Update to testing scripts to add bf16 support * added more static_assert * Remove buggy tunable configurations defined in device_reduce_instance_xxx.hpp * Add static_assert to give compile-time warning for incorrect thread slice-size/vector-size configurations * minor change * Refine and fix (in GetWorkspaceSizeInBytes of MultiBlockPartialReduce) to make int8 completely pass * Tiny renaming in gridwise_2d_reduction_multiblock_partial_reduce.hpp * Tiny fix in script/profile_reduce_no_index.sh * Refine in DeviceReduce layer with regard to using NumInvariantDim/NumReduceDim or InvariantDims/ReduceDims * Generic renaming in host reduction and DeviceReduce layer * Add support for 4-d all dimension reduction in the profiler and add_device_reduce_xxx instances * Use multi-thread and simplification for host Reduction implementation * Add ctest for reduction * Update to clarify the using of data init method in produce_reduce/example_reduce/test_reduce/ * Update to the reduce CTest executables to enable default testing behavior when no command argument * Renaming Co-authored-by: Jianfeng yan <jfyan008@gmail.com>
This commit is contained in:
@@ -3,14 +3,14 @@ rm -f CMakeCache.txt
|
||||
rm -f *.cmake
|
||||
rm -rf CMakeFiles
|
||||
|
||||
MY_PROJECT_SOURCE=../../..
|
||||
MY_PROJECT_SOURCE=../
|
||||
MY_PROJECT_INSTALL=../install.dir
|
||||
|
||||
cmake \
|
||||
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
|
||||
|
||||
@@ -3,13 +3,16 @@
|
||||
PRECISION=
|
||||
##PRECISION=--half
|
||||
##PRECISION=--double
|
||||
##PRECISION=--int8
|
||||
##PRECISION=--bf16
|
||||
|
||||
if test -n $PRECISION && test "$PRECISION" = "--half"; then
|
||||
if [ -n $PRECISION ] && [ "$PRECISION" = "--half" -o "$PRECISION" = "--bf16" ]; then
|
||||
ACCTYPE="-C 1"
|
||||
else
|
||||
ACCTYPE=""
|
||||
elif [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
|
||||
ACCTYPE="-C 2"
|
||||
fi
|
||||
|
||||
|
||||
driver="./bin/ckProfiler"
|
||||
|
||||
VERIFY="-v $1"
|
||||
@@ -20,10 +23,16 @@ NREPEAT=$3
|
||||
#### 0 - ADD, 5 - AVG, 7 - NORM2
|
||||
Operations="0 5 7"
|
||||
|
||||
#### 0 - ADD, 5 - AVG, for int8, no NORM2 supported
|
||||
if [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
|
||||
Operations=5
|
||||
fi
|
||||
|
||||
## for generic validation
|
||||
for op in $Operations; do
|
||||
set -x
|
||||
####### datatype layout reduce dims op acctype verify init repeats
|
||||
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
|
||||
$driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
|
||||
$driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
|
||||
$driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
PRECISION=
|
||||
##PRECISION=--half
|
||||
##PRECISION=--double
|
||||
##PRECISION=--int8
|
||||
##PRECISION=--bf16
|
||||
|
||||
driver="./bin/ckProfiler"
|
||||
|
||||
@@ -18,6 +20,7 @@ for op in $Operations; do
|
||||
for use_idx in 0 1; do
|
||||
set -x
|
||||
####### datatype layout reduce dims op use index verify init repeats
|
||||
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
||||
$driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
||||
$driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
||||
$driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
|
||||
|
||||
52
script/test_reduce_no_index.sh
Executable file
52
script/test_reduce_no_index.sh
Executable file
@@ -0,0 +1,52 @@
|
||||
#!/bin/bash
|
||||
|
||||
## The following will be used for CI
|
||||
|
||||
set -x
|
||||
|
||||
## for float
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 0 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 0 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 0 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 0 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 0 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0 0 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2
|
||||
|
||||
## for float16
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 1 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 1 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 1 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0 1 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 1 1 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 2 1 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 3 1 2
|
||||
|
||||
## for int8_t
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 3 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 3 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 3 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 3 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 3 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0 3 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 1 3 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 2 3 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 3 3 2
|
||||
|
||||
## for bfloat16
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 5 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 5 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 5 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 5 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 5 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 0 5 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 1 5 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 2 5 2
|
||||
bin/test_reduce_no_index -D 64,4,280,82 -R 3 5 2
|
||||
|
||||
set +x
|
||||
|
||||
52
script/test_reduce_with_index.sh
Executable file
52
script/test_reduce_with_index.sh
Executable file
@@ -0,0 +1,52 @@
|
||||
#!/bin/bash
|
||||
|
||||
## The following will be used for CI
|
||||
|
||||
set -x
|
||||
|
||||
## for float
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 0 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 0 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 0 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 0 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 0 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0 0 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2
|
||||
|
||||
## for float16
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 1 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 1 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 1 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0 1 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 1 1 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 2 1 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 3 1 2
|
||||
|
||||
## for int8_t
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 3 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 3 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 3 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 3 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 3 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0 3 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 1 3 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 2 3 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 3 3 2
|
||||
|
||||
## for bfloat16
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 5 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 5 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 5 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 5 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 5 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 0 5 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 1 5 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 2 5 2
|
||||
bin/test_reduce_with_index -D 64,4,280,82 -R 3 5 2
|
||||
|
||||
set +x
|
||||
|
||||
Reference in New Issue
Block a user