Compile for gfx908 and gfx90a (#130)

* adding compilation for multiple targets

* fix build

* clean

* update Jekinsfile

* update readme

* update Jenkins

* use ck::half_t instead of ushort for bf16

* rename enum classes

* clean

* rename

* clean
This commit is contained in:
Chao Liu
2022-03-31 12:33:34 -05:00
committed by GitHub
parent ecf337bab5
commit cd167e492a
227 changed files with 1398 additions and 2944 deletions

View File

@@ -1,45 +1,12 @@
# Instructions for ```reduce_blockwise``` Example
# Instructions for ```example_reduce_blockwise```
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```reduce_blockwise```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j reduce_blockwise
```
## Run ```reduce_blockwise```
## Run ```example_reduce_blockwise```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg2: run kernel # of times (>1)
./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
```
Result
@@ -50,7 +17,7 @@ Start running 3 times...
Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
error: 0
max_diff: 0, 529, 529
root@dc-smc-18:/data/composable_kernel/Build3# bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 10 times...

View File

@@ -32,10 +32,10 @@ using HostAccDataType = float;
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2;
constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN;
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::NO_INDICES;
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN;
constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using InElementwiseOperation =
@@ -210,11 +210,11 @@ int main(int argc, char* argv[])
return (-1);
constexpr bool op_support_indices =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX);
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices =
(op_support_indices && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES));
(op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));
// if input is half type, no reason to use float for indiced reduction operation and must use
// float for non-indiced reduction operation for accuracy
@@ -230,7 +230,7 @@ int main(int argc, char* argv[])
// indices option can only be used when it is really needed
constexpr bool invalid_reduce_3 =
(!op_support_indices && IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
(!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);