mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Compile for gfx908 and gfx90a (#130)
* adding compilation for multiple targets * fix build * clean * update Jenkinsfile * update readme * update Jenkins * use ck::half_t instead of ushort for bf16 * rename enum classes * clean * rename * clean
This commit is contained in:
@@ -1,45 +1,12 @@
|
||||
# Instructions for ```reduce_blockwise``` Example
|
||||
# Instructions for ```example_reduce_blockwise```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```reduce_blockwise```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j reduce_blockwise
|
||||
```
|
||||
|
||||
## Run ```reduce_blockwise```
|
||||
## Run ```example_reduce_blockwise```
|
||||
```bash
|
||||
# -D <xxx> : input 4-d tensor lengths
|
||||
# -v <x> : verification (0=no, 1=yes)
|
||||
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
|
||||
#arg2: run kernel # of times (>1)
|
||||
./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
|
||||
./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
|
||||
```
|
||||
|
||||
Result
|
||||
@@ -50,7 +17,7 @@ Start running 3 times...
|
||||
Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
|
||||
error: 0
|
||||
max_diff: 0, 529, 529
|
||||
root@dc-smc-18:/data/composable_kernel/Build3# bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
|
||||
root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
|
||||
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up
|
||||
Start running 10 times...
|
||||
|
||||
@@ -32,10 +32,10 @@ using HostAccDataType = float;
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2;
|
||||
constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN;
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
|
||||
constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::NO_INDICES;
|
||||
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
|
||||
constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN;
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
|
||||
constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;
|
||||
|
||||
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
||||
using InElementwiseOperation =
|
||||
@@ -210,11 +210,11 @@ int main(int argc, char* argv[])
|
||||
return (-1);
|
||||
|
||||
constexpr bool op_support_indices =
|
||||
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
|
||||
ReduceOpId == ReduceTensorOp_t::AMAX);
|
||||
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
|
||||
ReduceOpId == ReduceTensorOp::AMAX);
|
||||
|
||||
constexpr bool NeedIndices =
|
||||
(op_support_indices && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES));
|
||||
(op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));
|
||||
|
||||
// if input is half type, there is no reason to use float for indexed reduction operations, but we must use
|
||||
// float for non-indexed reduction operations for accuracy
|
||||
@@ -230,7 +230,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
// indices option can only be used when it is really needed
|
||||
constexpr bool invalid_reduce_3 =
|
||||
(!op_support_indices && IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
(!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);
|
||||
|
||||
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user