Pr82 followup (#115)

* Use thread cluster descriptor and explicit M_K 2d descriptor to simply Blockwise Reduction * Change by replacing ReduceDims by NumReduceDims as Device Reduce interface template parameter * Rename the folder name for the pool2d and reduce examples * Update to reduction test scripts * Add Readme for pool2d_fwd and reduce_blockwise examples * Tiny fix in reduce profiler and tiny update in reduce testing scripts * Tiny fix in testing script profile_reduce_no_index.sh * Tiny change in script/profile_reduce_with_index.sh * Renaming and refining in Reduction profiler/device layer/examples * Renaming and refining in Reduction profiler/device layer/examples * Renaming all NumReduceDims to NumReduceDim [ROCm/composable_kernel commit: 827301d95a]
2026-05-19 12:30:16 +00:00 · 2022-03-11 00:14:43 +08:00
parent 82ad74304e
commit bc3b1d6fe7
70 changed files with 1704 additions and 1576 deletions
--- a/example/12_reduce/README.md
+++ b/example/12_reduce/README.md
@@ -0,0 +1,60 @@
+# Instructions for ```reduce_blockwise``` Example
+
+## Docker script
+```bash
+docker run                                                                   \
+-it                                                                          \
+--rm                                                                         \
+--privileged                                                                 \
+--group-add sudo                                                             \
+-w /root/workspace                                                           \
+-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
+rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
+/bin/bash
+```
+
+## Build ```reduce_blockwise```
+```bash
+mkdir build && cd build
+```
+
+```bash
+# Need to specify target ID, example below is gfx908
+cmake                                                                  \
+-D BUILD_DEV=OFF                                                       \
+-D CMAKE_BUILD_TYPE=Release                                            \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
+-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
+..
+```
+
+```bash
+ make -j reduce_blockwise 
+```
+
+## Run ```reduce_blockwise```
+```bash
+# -D <xxx> : input 4-d tensor lengths
+# -v <x> :   verification (0=no, 1=yes)
+#arg1: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg2: run kernel # of times (>1)
+./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
+```
+
+Result
+```
+launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} 
+Warm up
+Start running 3 times...
+Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
+error: 0
+max_diff: 0, 529, 529
+root@dc-smc-18:/data/composable_kernel/Build3# bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
+launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} 
+Warm up
+Start running 10 times...
+Perf: 0.23392 ms, 268.966 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
+error: 0
+max_diff: 0, 528, 528
+```
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -14,6 +14,7 @@
 #include "device_reduce_blockwise.hpp"
 #include "host_reduce_util.hpp"
 #include "host_generic_reduction.hpp"
+
 #include "reduction_enums.hpp"
 #include "reduction_operator_mapping.hpp"

@@ -28,8 +29,8 @@ using kInDataType  = ck::half_t;
 using kOutDataType = ck::half_t;
 using kAccDataType = float;

-constexpr int Rank = 4;
-using ReduceDims_  = ck::Sequence<0, 1, 2>;
+constexpr int Rank         = 4;
+constexpr int NumReduceDim = 3;

 constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2;
 constexpr NanPropagation_t NanOpt     = NanPropagation_t::PROPAGATE_NAN;
@@ -46,7 +47,7 @@ using DeviceReduceInstance = DeviceReduceBlockWise<kInDataType,
                                                   kAccDataType,
                                                   kOutDataType,
                                                   Rank,
-                                                   ReduceDims_,
+                                                   NumReduceDim,
                                                   ReduceOperation,
                                                   InElementwiseOperation,
                                                   AccElementwiseOperation,
@@ -192,39 +193,13 @@ class SimpleAppArgs
    };
 };

-template <int Rank, typename ReduceDims>
-static std::vector<int> get_reduce_dims()
-{
-    std::vector<int> resDims;
-
-    static_for<0, ReduceDims::Size(), 1>{}([&](auto i) { resDims.push_back(ReduceDims::At(i)); });
-
-    return (resDims);
-};
-
-template <int Rank, typename ReduceDims>
-static std::vector<int> get_invariant_dims()
-{
-    std::vector<int> resDims;
-    unsigned int incFlag = 0;
-
-    static_for<0, ReduceDims::Size(), 1>{}(
-        [&](auto i) { incFlag = incFlag | (0x1 << ReduceDims::At(i)); });
-
-    for(int dim = 0; dim < Rank; dim++)
-    {
-        if(incFlag & (0x1 << dim))
-            continue;
-        resDims.push_back(dim);
-    };
-
-    return (resDims);
-};
-
 int main(int argc, char* argv[])
 {
    using namespace ck::host_reduce;

+    const std::vector<int> reduceDims{0, 1, 2};
+    const std::vector<int> invariantDims{3};
+
    SimpleAppArgs args;

    if(args.processArgs(argc, argv) < 0)
@@ -260,15 +235,12 @@ int main(int argc, char* argv[])

    Tensor<InDataType> in(args.inLengths);

-    const std::vector<int> InvariantDims = get_invariant_dims<Rank, ReduceDims_>();
-    const std::vector<int> ReduceDims    = get_reduce_dims<Rank, ReduceDims_>();
-
    std::vector<size_t> outLengths;

-    if(InvariantDims.empty())
+    if(invariantDims.empty())
        outLengths.push_back(1);
    else
-        for(auto dim : InvariantDims)
+        for(auto dim : invariantDims)
            outLengths.push_back(args.inLengths[dim]);

    Tensor<OutDataType> out_ref(outLengths);
@@ -328,7 +300,7 @@ int main(int argc, char* argv[])
    if(args.do_verification)
    {
        ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, PropagateNan, NeedIndices>
-            hostReduce(in.mDesc, out_ref.mDesc, InvariantDims, ReduceDims);
+            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

        hostReduce.Run(
            alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
@@ -350,6 +322,7 @@ int main(int argc, char* argv[])
                                   i_inStrides,
                                   i_outLengths,
                                   i_outStrides,
+                                   reduceDims,
                                   alpha,
                                   beta,
                                   in_dev.GetDeviceBuffer(),
--- a/example/13_pool2d_fwd/README.md
+++ b/example/13_pool2d_fwd/README.md
@@ -0,0 +1,55 @@
+# Instructions for ```pool2d_fwd``` Example
+
+## Docker script
+```bash
+docker run                                                                   \
+-it                                                                          \
+--rm                                                                         \
+--privileged                                                                 \
+--group-add sudo                                                             \
+-w /root/workspace                                                           \
+-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
+rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
+/bin/bash
+```
+
+## Build ```pool2d_fwd```
+```bash
+mkdir build && cd build
+```
+
+```bash
+# Need to specify target ID, example below is gfx908
+cmake                                                                  \
+-D BUILD_DEV=OFF                                                       \
+-D CMAKE_BUILD_TYPE=Release                                            \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
+-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
+..
+```
+
+```bash
+ make -j pool2d_fwd
+```
+
+## Run ```pool2d_fwd```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: run kernel # of times (>1)
+#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
+./example/pool2d_fwd 1 1 10
+```
+
+Result 
+```
+in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
+out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
+launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1} 
+Warm up
+Start running 10 times...
+Perf: 0.415453 ms, 1.37996 TFlops, 749.726 GB/s
+error: 0
+max_diff: 0, 1, 1
+```