mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-19 12:30:16 +00:00
Pr82 followup (#115)
* Use thread cluster descriptor and explicit M_K 2d descriptor to simply Blockwise Reduction
* Change by replacing ReduceDims by NumReduceDims as Device Reduce interface template parameter
* Rename the folder name for the pool2d and reduce examples
* Update to reduction test scripts
* Add Readme for pool2d_fwd and reduce_blockwise examples
* Tiny fix in reduce profiler and tiny update in reduce testing scripts
* Tiny fix in testing script profile_reduce_no_index.sh
* Tiny change in script/profile_reduce_with_index.sh
* Renaming and refining in Reduction profiler/device layer/examples
* Renaming and refining in Reduction profiler/device layer/examples
* Renaming all NumReduceDims to NumReduceDim
[ROCm/composable_kernel commit: 827301d95a]
This commit is contained in:
60
example/12_reduce/README.md
Normal file
60
example/12_reduce/README.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# Instructions for ```reduce_blockwise``` Example
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```reduce_blockwise```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j reduce_blockwise
|
||||
```
|
||||
|
||||
## Run ```reduce_blockwise```
|
||||
```bash
|
||||
# -D <xxx> : input 4-d tensor lengths
|
||||
# -v <x> : verification (0=no, 1=yes)
|
||||
#arg1: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg2: run kernel # of times (>1)
|
||||
./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
|
||||
```
|
||||
|
||||
Result
|
||||
```
|
||||
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up
|
||||
Start running 3 times...
|
||||
Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
|
||||
error: 0
|
||||
max_diff: 0, 529, 529
|
||||
root@dc-smc-18:/data/composable_kernel/Build3# bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
|
||||
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up
|
||||
Start running 10 times...
|
||||
Perf: 0.23392 ms, 268.966 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
|
||||
error: 0
|
||||
max_diff: 0, 528, 528
|
||||
```
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "device_reduce_blockwise.hpp"
|
||||
#include "host_reduce_util.hpp"
|
||||
#include "host_generic_reduction.hpp"
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
|
||||
@@ -28,8 +29,8 @@ using kInDataType = ck::half_t;
|
||||
using kOutDataType = ck::half_t;
|
||||
using kAccDataType = float;
|
||||
|
||||
constexpr int Rank = 4;
|
||||
using ReduceDims_ = ck::Sequence<0, 1, 2>;
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2;
|
||||
constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN;
|
||||
@@ -46,7 +47,7 @@ using DeviceReduceInstance = DeviceReduceBlockWise<kInDataType,
|
||||
kAccDataType,
|
||||
kOutDataType,
|
||||
Rank,
|
||||
ReduceDims_,
|
||||
NumReduceDim,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
@@ -192,39 +193,13 @@ class SimpleAppArgs
|
||||
};
|
||||
};
|
||||
|
||||
template <int Rank, typename ReduceDims>
|
||||
static std::vector<int> get_reduce_dims()
|
||||
{
|
||||
std::vector<int> resDims;
|
||||
|
||||
static_for<0, ReduceDims::Size(), 1>{}([&](auto i) { resDims.push_back(ReduceDims::At(i)); });
|
||||
|
||||
return (resDims);
|
||||
};
|
||||
|
||||
template <int Rank, typename ReduceDims>
|
||||
static std::vector<int> get_invariant_dims()
|
||||
{
|
||||
std::vector<int> resDims;
|
||||
unsigned int incFlag = 0;
|
||||
|
||||
static_for<0, ReduceDims::Size(), 1>{}(
|
||||
[&](auto i) { incFlag = incFlag | (0x1 << ReduceDims::At(i)); });
|
||||
|
||||
for(int dim = 0; dim < Rank; dim++)
|
||||
{
|
||||
if(incFlag & (0x1 << dim))
|
||||
continue;
|
||||
resDims.push_back(dim);
|
||||
};
|
||||
|
||||
return (resDims);
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck::host_reduce;
|
||||
|
||||
const std::vector<int> reduceDims{0, 1, 2};
|
||||
const std::vector<int> invariantDims{3};
|
||||
|
||||
SimpleAppArgs args;
|
||||
|
||||
if(args.processArgs(argc, argv) < 0)
|
||||
@@ -260,15 +235,12 @@ int main(int argc, char* argv[])
|
||||
|
||||
Tensor<InDataType> in(args.inLengths);
|
||||
|
||||
const std::vector<int> InvariantDims = get_invariant_dims<Rank, ReduceDims_>();
|
||||
const std::vector<int> ReduceDims = get_reduce_dims<Rank, ReduceDims_>();
|
||||
|
||||
std::vector<size_t> outLengths;
|
||||
|
||||
if(InvariantDims.empty())
|
||||
if(invariantDims.empty())
|
||||
outLengths.push_back(1);
|
||||
else
|
||||
for(auto dim : InvariantDims)
|
||||
for(auto dim : invariantDims)
|
||||
outLengths.push_back(args.inLengths[dim]);
|
||||
|
||||
Tensor<OutDataType> out_ref(outLengths);
|
||||
@@ -328,7 +300,7 @@ int main(int argc, char* argv[])
|
||||
if(args.do_verification)
|
||||
{
|
||||
ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, PropagateNan, NeedIndices>
|
||||
hostReduce(in.mDesc, out_ref.mDesc, InvariantDims, ReduceDims);
|
||||
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
|
||||
|
||||
hostReduce.Run(
|
||||
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
|
||||
@@ -350,6 +322,7 @@ int main(int argc, char* argv[])
|
||||
i_inStrides,
|
||||
i_outLengths,
|
||||
i_outStrides,
|
||||
reduceDims,
|
||||
alpha,
|
||||
beta,
|
||||
in_dev.GetDeviceBuffer(),
|
||||
|
||||
55
example/13_pool2d_fwd/README.md
Normal file
55
example/13_pool2d_fwd/README.md
Normal file
@@ -0,0 +1,55 @@
|
||||
# Instructions for ```pool2d_fwd``` Example
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```pool2d_fwd```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j pool2d_fwd
|
||||
```
|
||||
|
||||
## Run ```pool2d_fwd```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
|
||||
./example/pool2d_fwd 1 1 10
|
||||
```
|
||||
|
||||
Result
|
||||
```
|
||||
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
|
||||
out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
|
||||
launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
|
||||
Warm up
|
||||
Start running 10 times...
|
||||
Perf: 0.415453 ms, 1.37996 TFlops, 749.726 GB/s
|
||||
error: 0
|
||||
max_diff: 0, 1, 1
|
||||
```
|
||||
Reference in New Issue
Block a user