Compile for gfx908 and gfx90a (#130)

* adding compilation for multiple targets * fix build * clean * update Jenkinsfile * update readme * update Jenkins * use ck::half_t instead of ushort for bf16 * rename enum classes * clean * rename * clean
2026-04-19 22:39:03 +00:00 · 2022-03-31 12:33:34 -05:00
parent ecf337bab5
commit cd167e492a
227 changed files with 1398 additions and 2944 deletions
--- a/example/12_reduce/README.md
+++ b/example/12_reduce/README.md
@@ -1,45 +1,12 @@
-# Instructions for ```reduce_blockwise``` Example
+# Instructions for ```example_reduce_blockwise```

-## Docker script
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-
-## Build ```reduce_blockwise```
-```bash
-mkdir build && cd build
-```
-
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-
-```bash
- make -j reduce_blockwise 
-```
-
-## Run ```reduce_blockwise```
+## Run ```example_reduce_blockwise```
 ```bash
 # -D <xxx> : input 4-d tensor lengths
 # -v <x> :   verification (0=no, 1=yes)
 #arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
 #arg2: run kernel # of times (>1)
-./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
+./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
 ```

 Result
@@ -50,7 +17,7 @@ Start running 3 times...
 Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
 error: 0
 max_diff: 0, 529, 529
-root@dc-smc-18:/data/composable_kernel/Build3# bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
+root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
 launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} 
 Warm up
 Start running 10 times...
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -32,10 +32,10 @@ using HostAccDataType = float;
 constexpr int Rank         = 4;
 constexpr int NumReduceDim = 3;

-constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2;
-constexpr NanPropagation_t NanOpt     = NanPropagation_t::PROPAGATE_NAN;
-constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
-constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::NO_INDICES;
+constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
+constexpr NanPropagation NanOpt     = NanPropagation::PROPAGATE_NAN;
+constexpr bool PropagateNan         = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
+constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;

 using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
 using InElementwiseOperation =
@@ -210,11 +210,11 @@ int main(int argc, char* argv[])
        return (-1);

    constexpr bool op_support_indices =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);

    constexpr bool NeedIndices =
-        (op_support_indices && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES));
+        (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));

    // if input is half type, no reason to use float for indiced reduction operation and must use
    // float for non-indiced reduction operation for accuracy
@@ -230,7 +230,7 @@ int main(int argc, char* argv[])

    // indices option can only be used when it is really needed
    constexpr bool invalid_reduce_3 =
-        (!op_support_indices && IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+        (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);

    constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);