Reduction for int8 and bfloat16 (#125)

* Use thread cluster descriptor and explicit M_K 2d descriptor to simply Blockwise Reduction * Change by replacing ReduceDims by NumReduceDims as Device Reduce interface template parameter * Rename the folder name for the pool2d and reduce examples * Update to reduction test scripts * Add Readme for pool2d_fwd and reduce_blockwise examples * Add support for int8_t reduction (ADD/AVG, MIN/MAX/AMAX) * Tiny fix in reduce profiler and tiny update in reduce testing scripts * Tiny fix in testing script profile_reduce_no_index.sh * Tiny fix in testing script profile_reduce_no_index.sh * Add support for bfp16 reduction (using bhalf_t = ushort) * Tiny fix in amd_buffer_addressing.hpp * Tiny change in script/profile_reduce_with_index.sh * Use AccDataType for Beta value and use element_wise::PassThrough * Use type_convert for type converting in host layer reduction * Renaming and refining in Reduction profiler/device layer/examples * Renaming and refining in Reduction profiler/device layer/examples * Renaming all NumReduceDims to NumReduceDim * Fix the leaked type_convert in ThreadwiseTensorSliceTransfer_v2 * Update to testing scripts to add bf16 support * added more static_assert * Remove buggy tunable configurations defined in device_reduce_instance_xxx.hpp * Add static_assert to give compile-time warning for incorrect thread slice-size/vector-size configurations * minor change * Refine and fix (in GetWorkspaceSizeInBytes of MultiBlockPartialReduce) to make int8 completely pass * Tiny renaming in gridwise_2d_reduction_multiblock_partial_reduce.hpp * Tiny fix in script/profile_reduce_no_index.sh * Refine in DeviceReduce layer with regard to using NumInvariantDim/NumReduceDim or InvariantDims/ReduceDims * Generic renaming in host reduction and DeviceReduce layer * Add support for 4-d all dimension reduction in the profiler and add_device_reduce_xxx instances * Use multi-thread and simplification for host Reduction implementation * Add ctest for reduction * Update to clarify the using of data init method in produce_reduce/example_reduce/test_reduce/ * Update to the reduce CTest executables to enable default testing behavior when no command argument * Renaming Co-authored-by: Jianfeng yan <jfyan008@gmail.com> [ROCm/composable_kernel commit: 9a8ee8a39a]
2026-05-24 06:44:36 +00:00 · 2022-03-23 03:35:14 +08:00
parent aa4c28d53b
commit 8ffa0717f9
113 changed files with 4036 additions and 973 deletions
--- a/script/cmake-rocm.sh
+++ b/script/cmake-rocm.sh
@@ -3,14 +3,14 @@ rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles

-MY_PROJECT_SOURCE=../../..
+MY_PROJECT_SOURCE=../
 MY_PROJECT_INSTALL=../install.dir

 cmake                                                                                                                                          \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                                                                                  \
 -D BUILD_DEV=OFF                                                                                                                               \
 -D CMAKE_BUILD_TYPE=Release                                                                                                                    \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD"   \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only "   \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                                                                      \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                                                                 \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                                                              \
--- a/script/profile_reduce_no_index.sh
+++ b/script/profile_reduce_no_index.sh
@@ -3,13 +3,16 @@
 PRECISION=
 ##PRECISION=--half
 ##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16

-if test -n $PRECISION && test "$PRECISION" = "--half"; then 
+if [ -n $PRECISION ] && [ "$PRECISION" = "--half" -o "$PRECISION" = "--bf16" ]; then
   ACCTYPE="-C 1"
-else
-   ACCTYPE=""
+elif [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
+   ACCTYPE="-C 2"
 fi

+
 driver="./bin/ckProfiler"

 VERIFY="-v $1"
@@ -20,10 +23,16 @@ NREPEAT=$3
 #### 0 - ADD,  5 - AVG,  7 - NORM2
 Operations="0 5 7"

+#### 0 - ADD,  5 - AVG,    for int8, no NORM2 supported
+if [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
+   Operations=5
+fi
+
 ## for generic validation
 for op in $Operations; do
    set -x
    #######        datatype   layout          reduce dims  op     acctype   verify  init  repeats
+    $driver reduce $PRECISION -D 64,4,280,82  -R 0,1,2,3   -O $op $ACCTYPE  $VERIFY $INIT $NREPEAT
    $driver reduce $PRECISION -D 64,4,280,82  -R 0         -O $op $ACCTYPE  $VERIFY $INIT $NREPEAT
    $driver reduce $PRECISION -D 64,4,280,82  -R 1         -O $op $ACCTYPE  $VERIFY $INIT $NREPEAT
    $driver reduce $PRECISION -D 64,4,280,82  -R 2         -O $op $ACCTYPE  $VERIFY $INIT $NREPEAT
--- a/script/profile_reduce_with_index.sh
+++ b/script/profile_reduce_with_index.sh
@@ -3,6 +3,8 @@
 PRECISION=
 ##PRECISION=--half
 ##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16

 driver="./bin/ckProfiler"

@@ -18,6 +20,7 @@ for op in $Operations; do
    for use_idx in 0 1; do
        set -x
        #######        datatype   layout          reduce dims  op     use index    verify  init  repeats
+        $driver reduce $PRECISION -D 64,4,280,82  -R 0,1,2,3   -O $op -I $use_idx  $VERIFY $INIT $NREPEAT
        $driver reduce $PRECISION -D 64,4,280,82  -R 0         -O $op -I $use_idx  $VERIFY $INIT $NREPEAT
        $driver reduce $PRECISION -D 64,4,280,82  -R 1         -O $op -I $use_idx  $VERIFY $INIT $NREPEAT
        $driver reduce $PRECISION -D 64,4,280,82  -R 2         -O $op -I $use_idx  $VERIFY $INIT $NREPEAT
--- a/script/test_reduce_no_index.sh
+++ b/script/test_reduce_no_index.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+## The following will be used for CI
+
+set -x
+
+## for float
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2,3  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,3  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,2,3  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1,2,3  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 2  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 3  0 2
+
+## for float16
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2,3  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,3  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,2,3  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1,2,3  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 2  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 3  1 2
+
+## for int8_t
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2,3  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,3  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,2,3  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1,2,3  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 2  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 3  3 2
+
+## for bfloat16
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2,3  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,3  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,2,3  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1,2,3  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 2  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 3  5 2
+
+set +x
+
--- a/script/test_reduce_with_index.sh
+++ b/script/test_reduce_with_index.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+## The following will be used for CI
+
+set -x
+
+## for float
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 2  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 3  0 2
+
+## for float16
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 2  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 3  1 2
+
+## for int8_t
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 2  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 3  3 2
+
+## for bfloat16
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 2  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 3  5 2
+
+set +x
+