mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 19:40:04 +00:00
Compile for gfx908 and gfx90a (#130)
* adding compilation for multiple targets
* fix build
* clean
* update Jenkinsfile
* update readme
* update Jenkins
* use ck::half_t instead of ushort for bf16
* rename enum classes
* clean
* rename
* clean
[ROCm/composable_kernel commit: cd167e492a]
This commit is contained in:
@@ -1,44 +1,11 @@
|
||||
# Instructions for ```gemm_xdl``` Example
|
||||
# Instructions for ```example_gemm_xdl```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```gemm_xdl```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j gemm_xdl
|
||||
```
|
||||
|
||||
## Run ```gemm_xdl```
|
||||
## Run ```example_gemm_xdl```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
./example/gemm_xdl 0 1 5
|
||||
./bin/example_gemm_xdl 0 1 5
|
||||
```
|
||||
|
||||
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
|
||||
|
||||
@@ -40,7 +40,7 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
|
||||
// clang-format off
|
||||
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
|
||||
|
||||
@@ -1,44 +1,11 @@
|
||||
# Instructions for ```gemm_xdl_alpha_beta``` Example
|
||||
# Instructions for ```example_gemm_xdl_alpha_beta```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```gemm_xdl_alpha_beta```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j gemm_xdl_alpha_beta
|
||||
```
|
||||
|
||||
## Run ```gemm_xdl_alpha_beta```
|
||||
## Run ```example_gemm_xdl_alpha_beta```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
./example/gemm_xdl_alpha_beta 1 1 1 0.5 0.5
|
||||
./bin/example_gemm_xdl_alpha_beta 1 1 1 0.5 0.5
|
||||
```
|
||||
Result (MI100 @ 1502Mhz, 184.6TFlops peak FP16)
|
||||
```
|
||||
|
||||
@@ -1,45 +1,12 @@
|
||||
# Instructions for ```gemm_xdl_bias_relu_add``` Example
|
||||
# Instructions for ```example_gemm_xdl_bias_relu_add```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```gemm_xdl_bias_relu_add```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j gemm_xdl_bias_relu_add
|
||||
```
|
||||
|
||||
## Run ```gemm_xdl_bias_relu_add```
|
||||
## Run ```example_gemm_xdl_bias_relu_add```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
#arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
|
||||
./example/gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096
|
||||
./bin/example_gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096
|
||||
```
|
||||
|
||||
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
|
||||
|
||||
@@ -1,45 +1,12 @@
|
||||
# Instructions for ```gemm_xdl_bias_relu_add``` Example
|
||||
# Instructions for ```example_gemm_xdl_bias_relu_add```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```gemm_xdl_bias_relu_add```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j gemm_xdl_bias_relu_add
|
||||
```
|
||||
|
||||
## Run ```gemm_xdl_bias_relu_add```
|
||||
## Run ```example_gemm_xdl_bias_relu_add```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
#arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
|
||||
./example/gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096
|
||||
./bin/example_gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096
|
||||
```
|
||||
|
||||
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
|
||||
|
||||
@@ -1,45 +1,12 @@
|
||||
# Instructions for ```conv2d_fwd_xdl``` Example
|
||||
# Instructions for ```example_conv2d_fwd_xdl```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```conv2d_fwd_xdl```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j conv2d_fwd_xdl
|
||||
```
|
||||
|
||||
## Run ```conv2d_fwd_xdl```
|
||||
## Run ```example_conv2d_fwd_xdl```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
|
||||
./example/conv2d_fwd_xdl 0 1 5
|
||||
./bin/example_conv2d_fwd_xdl 0 1 5
|
||||
```
|
||||
|
||||
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
|
||||
|
||||
@@ -34,7 +34,7 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto ConvFwdDefault =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
using DeviceConvFwdInstance = ck::tensor_operation::device::
|
||||
DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
|
||||
|
||||
@@ -35,7 +35,7 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto ConvFwdDefault =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
using DeviceConvFwdInstance = ck::tensor_operation::device::
|
||||
DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
|
||||
|
||||
@@ -1,45 +1,12 @@
|
||||
# Instructions for ```conv_xdl_bias_relu_add``` Example
|
||||
# Instructions for ```example_conv_xdl_bias_relu```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```conv_xdl_bias_relu_add```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j conv_xdl_bias_relu_add
|
||||
```
|
||||
|
||||
## Run ```conv_xdl_bias_relu_add```
|
||||
## Run ```example_conv_xdl_bias_relu```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
|
||||
./example/conv_xdl_bias_relu_add 0 1 5
|
||||
./bin/example_conv_xdl_bias_relu 0 1 5
|
||||
```
|
||||
|
||||
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
|
||||
@@ -48,14 +15,8 @@ in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
|
||||
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
|
||||
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
|
||||
bias_k: dim 1, lengths {256}, strides {1}
|
||||
resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
|
||||
arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
|
||||
arg.b_grid_desc_k0_n_k1_{216, 256, 8}
|
||||
arg.c_grid_desc_m_n_{ 165888, 256}
|
||||
arg.c0_grid_desc_m_n_{ 165888, 256}
|
||||
arg.c1_grid_desc_m_n_{ 165888, 256}
|
||||
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up
|
||||
Start running 5 times...
|
||||
Perf: 1.71779 ms, 85.4396 TFlops, 194.2 GB/s
|
||||
Perf: 1.39009 ms, 105.581 TFlops, 239.981 GB/s
|
||||
```
|
||||
|
||||
@@ -32,10 +32,10 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
|
||||
|
||||
static constexpr auto MemorySet = ck::InMemoryDataOperationEnum_t::Set;
|
||||
static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set;
|
||||
|
||||
static constexpr auto ConvFwdDefault =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
// clang-format off
|
||||
using DeviceConvFwdInstance = ck::tensor_operation::device::
|
||||
|
||||
@@ -1,45 +1,13 @@
|
||||
# Instructions for ```conv_xdl_bias_relu_add``` Example
|
||||
# Instructions for ```example_conv_xdl_bias_relu_add```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```conv_xdl_bias_relu_add```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j conv_xdl_bias_relu_add
|
||||
```
|
||||
|
||||
## Run ```conv_xdl_bias_relu_add```
|
||||
## Run ```example_conv_xdl_bias_relu_add```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
|
||||
./example/conv_xdl_bias_relu_add 0 1 5
|
||||
./bin/example_conv_xdl_bias_relu_add 0 1 5
|
||||
```
|
||||
|
||||
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
|
||||
@@ -49,13 +17,8 @@ wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
|
||||
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
|
||||
bias_k: dim 1, lengths {256}, strides {1}
|
||||
resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
|
||||
arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
|
||||
arg.b_grid_desc_k0_n_k1_{216, 256, 8}
|
||||
arg.c_grid_desc_m_n_{ 165888, 256}
|
||||
arg.c0_grid_desc_m_n_{ 165888, 256}
|
||||
arg.c1_grid_desc_m_n_{ 165888, 256}
|
||||
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up
|
||||
Start running 5 times...
|
||||
Perf: 1.71779 ms, 85.4396 TFlops, 194.2 GB/s
|
||||
Perf: 1.44711 ms, 101.421 TFlops, 289.218 GB/s
|
||||
```
|
||||
|
||||
@@ -33,7 +33,7 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
static constexpr auto ConvFwdDefault =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
// clang-format off
|
||||
using DeviceConvFwdInstance = ck::tensor_operation::device::
|
||||
|
||||
@@ -1,57 +1,24 @@
|
||||
# Instructions for ```conv3d_fwd_xdl``` Example
|
||||
# Instructions for ```example_conv3d_fwd_xdl```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```conv3d_fwd_xdl```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j conv3d_fwd_xdl
|
||||
```
|
||||
|
||||
## Run ```conv3d_fwd_xdl```
|
||||
## Run ```example_conv3d_fwd_xdl```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
#arg4 to 24: N, K, C, Z, Y, X, Di, Hi, Wi, Sz, Sy, Sx, Dz, Dy, Dx, LeftPz, LeftPy, LeftPx, RightPz, RightPy, RightPx
|
||||
./example/conv3d_fwd_xdl 0 1 5
|
||||
./bin/example_conv3d_fwd_xdl 0 1 5
|
||||
```
|
||||
|
||||
Result (MI100 dynamic frequency)
|
||||
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
|
||||
```
|
||||
in: dim 5, lengths {4, 71, 71, 71, 192}, strides {68718912, 967872, 13632, 192, 1}
|
||||
wei: dim 5, lengths {256, 3, 3, 3, 192}, strides {5184, 1728, 576, 192, 1}
|
||||
out: dim 5, lengths {4, 36, 36, 36, 256}, strides {11943936, 331776, 9216, 256, 1}
|
||||
a_grid_desc_b_k0_m_k1{1, 648, 186624, 8}
|
||||
b_grid_desc_b_k0_n_k1{1, 648, 256, 8}
|
||||
num_batches_of_GEMM = 1
|
||||
a_grid_desc_k0_m_k1{648, 186624, 8}
|
||||
b_grid_desc_k0_n_k1{648, 256, 8}
|
||||
c_grid_desc_m_n{ 186624, 256}
|
||||
launch_and_time_kernel: grid_dim {1458, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up
|
||||
Start running 5 times...
|
||||
Perf: 4.49466 ms, 110.206 TFlops, 144.161 GB/s
|
||||
Perf: 4.58795 ms, 107.965 TFlops, 141.23 GB/s
|
||||
```
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ using WeiLayout = ck::tensor_layout::convolution::KZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWK;
|
||||
|
||||
static constexpr auto ConvFwdDefault =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
using DeviceConv3dFwdInstance = ck::tensor_operation::device::
|
||||
DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K<
|
||||
|
||||
@@ -1,39 +1,6 @@
|
||||
# Instructions for ```convnd_fwd_xdl``` Example
|
||||
# Instructions for ```example_convnd_fwd_xdl```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```convnd_fwd_xdl```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j convnd_fwd_xdl
|
||||
```
|
||||
|
||||
## Run ```convnd_fwd_xdl```
|
||||
## Run ```example_convnd_fwd_xdl```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
@@ -47,7 +14,7 @@ cmake \
|
||||
# <dilations>, (ie Dy, Dx for 2D)
|
||||
# <left padding>, (ie LeftPy, LeftPx for 2D)
|
||||
# <right padding>, (ie RightPy, RightPx for 2D)
|
||||
./example/convnd_fwd_xdl 0 1 100
|
||||
./bin/example_convnd_fwd_xdl 0 1 100
|
||||
```
|
||||
|
||||
Result (MI100 @ 1087Mhz, 33.4TFlops peak FP32)
|
||||
|
||||
@@ -26,7 +26,7 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto ConvFwdDefault =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
using DeviceConvFwdBasePtr =
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;
|
||||
|
||||
@@ -1,45 +1,13 @@
|
||||
# Instructions for ```conv2d_bwd_data_xdl``` Example
|
||||
# Instructions for ```example_conv2d_bwd_data_xdl``` Example
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```conv2d_bwd_data_xdl```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j conv2d_bwd_data_xdl
|
||||
```
|
||||
|
||||
## Run ```conv2d_bwd_data_xdl```
|
||||
## Run ```example_conv2d_bwd_data_xdl```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
|
||||
./bin/conv2d_bwd_data_xdl 0 1 5
|
||||
./bin/example_conv2d_bwd_data_xdl 0 1 5
|
||||
```
|
||||
|
||||
Result
|
||||
|
||||
@@ -27,7 +27,7 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
static constexpr auto ConvBwdDefault =
|
||||
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default;
|
||||
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
|
||||
|
||||
using DeviceConvBwdDataInstance = ck::tensor_operation::device::
|
||||
DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
|
||||
@@ -38,7 +38,7 @@ using DeviceConvBwdDataInstance = ck::tensor_operation::device::
|
||||
InElementOp, // InElementwiseOperation
|
||||
WeiElementOp, // WeiElementwiseOperation
|
||||
OutElementOp, // OutElementwiseOperation
|
||||
ConvBwdDefault, // ConvolutionBackwardDataSpecialization_t
|
||||
ConvBwdDefault, // ConvolutionBackwardDataSpecialization
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
128, // NPerBlock
|
||||
|
||||
@@ -1,39 +1,6 @@
|
||||
# Instructions for ```conv2d_wrw_xdl``` Example
|
||||
# Instructions for ```example_conv2d_wrw_xdl``` Example
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```conv2d_wrw_xdl```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j conv2d_wrw_xdl
|
||||
```
|
||||
|
||||
## Run ```conv2d_wrw_xdl```
|
||||
## Run ```example_conv2d_wrw_xdl```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
|
||||
@@ -1,45 +1,12 @@
|
||||
# Instructions for ```reduce_blockwise``` Example
|
||||
# Instructions for ```example_reduce_blockwise```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```reduce_blockwise```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j reduce_blockwise
|
||||
```
|
||||
|
||||
## Run ```reduce_blockwise```
|
||||
## Run ```example_reduce_blockwise```
|
||||
```bash
|
||||
# -D <xxx> : input 4-d tensor lengths
|
||||
# -v <x> : verification (0=no, 1=yes)
|
||||
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
|
||||
#arg2: run kernel # of times (>1)
|
||||
./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
|
||||
./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
|
||||
```
|
||||
|
||||
Result
|
||||
@@ -50,7 +17,7 @@ Start running 3 times...
|
||||
Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
|
||||
error: 0
|
||||
max_diff: 0, 529, 529
|
||||
root@dc-smc-18:/data/composable_kernel/Build3# bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
|
||||
root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
|
||||
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up
|
||||
Start running 10 times...
|
||||
|
||||
@@ -32,10 +32,10 @@ using HostAccDataType = float;
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2;
|
||||
constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN;
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
|
||||
constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::NO_INDICES;
|
||||
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
|
||||
constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN;
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
|
||||
constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;
|
||||
|
||||
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
||||
using InElementwiseOperation =
|
||||
@@ -210,11 +210,11 @@ int main(int argc, char* argv[])
|
||||
return (-1);
|
||||
|
||||
constexpr bool op_support_indices =
|
||||
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
|
||||
ReduceOpId == ReduceTensorOp_t::AMAX);
|
||||
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
|
||||
ReduceOpId == ReduceTensorOp::AMAX);
|
||||
|
||||
constexpr bool NeedIndices =
|
||||
(op_support_indices && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES));
|
||||
(op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));
|
||||
|
||||
// if input is half type, no reason to use float for indexed reduction operation and must use
|
||||
// float for non-indexed reduction operation for accuracy
|
||||
@@ -230,7 +230,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
// indices option can only be used when it is really needed
|
||||
constexpr bool invalid_reduce_3 =
|
||||
(!op_support_indices && IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
(!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);
|
||||
|
||||
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);
|
||||
|
||||
|
||||
@@ -1,45 +1,12 @@
|
||||
# Instructions for ```pool2d_fwd``` Example
|
||||
# Instructions for ```example_pool2d_fwd``` Example
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```pool2d_fwd```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j pool2d_fwd
|
||||
```
|
||||
|
||||
## Run ```pool2d_fwd```
|
||||
## Run ```example_pool2d_fwd```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
|
||||
./example/pool2d_fwd 1 1 10
|
||||
./bin/example_pool2d_fwd 1 1 10
|
||||
```
|
||||
|
||||
Result
|
||||
|
||||
@@ -22,9 +22,9 @@ using InLayout = ck::tensor_layout::convolution::NHWC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NHWC;
|
||||
|
||||
#if 1
|
||||
static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::MAX;
|
||||
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
|
||||
#else
|
||||
static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::AVG;
|
||||
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
|
||||
#endif
|
||||
|
||||
static constexpr bool NeedIndices = false;
|
||||
@@ -47,7 +47,7 @@ using DevicePoolFwdInstance =
|
||||
template <typename InDataType,
|
||||
typename OutDataType,
|
||||
typename AccDataType,
|
||||
ck::ReduceTensorOp_t ReduceOpId,
|
||||
ck::ReduceTensorOp ReduceOpId,
|
||||
bool PropagateNan,
|
||||
bool NeedIndices>
|
||||
static void pool_host_verify(const Tensor<InDataType>& in,
|
||||
|
||||
@@ -1,39 +1,6 @@
|
||||
# Instructions for ```grouped_gemm_xdl``` Example
|
||||
# Instructions for ```example_grouped_gemm_xdl```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```grouped_gemm_xdl```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j example_grouped_gemm_xdl_fp16
|
||||
```
|
||||
|
||||
## Run ```grouped_gemm_xdl```
|
||||
## Run ```example_grouped_gemm_xdl```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
|
||||
@@ -40,9 +40,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
// static constexpr auto GemmMNPadding =
|
||||
// ck::tensor_operation::device::GemmSpecialization_t::MNPadding;
|
||||
// ck::tensor_operation::device::GemmSpecialization::MNPadding;
|
||||
|
||||
// clang-format off
|
||||
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl
|
||||
|
||||
@@ -40,7 +40,7 @@ using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum;
|
||||
using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum;
|
||||
|
||||
static constexpr auto GemmSpecialization =
|
||||
ck::tensor_operation::device::GemmSpecialization_t::Default;
|
||||
ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
|
||||
// clang-format off
|
||||
using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
|
||||
|
||||
@@ -1,46 +1,13 @@
|
||||
# Instructions for ```convnd_bwd_data_xdl``` Example
|
||||
# Instructions for ```example_convnd_bwd_data_xdl```
|
||||
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build ```convnd_bwd_data_xdl```
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify target ID, example below is gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j convnd_bwd_data_xdl
|
||||
```
|
||||
|
||||
## Run ```example_convnd_bwd_data_xdl```
|
||||
## Run ```example_convnd_bwd_data_xdl```
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: run kernel # of times (>1)
|
||||
#arg4: num_dim_spatial(1|2|3)
|
||||
#arg5 to ...: N, K, C, [Z,] [Y,] X, [Di,] [Hi,] Wi, [Sz,] [Sy,] Sx, [Dz,] [Dy,] Dx, [LeftPz,] [LeftPy,] LeftPx, [RightPz,] [RightPy,] RightPx
|
||||
./bin/convnd_bwd_data_xdl 0 1 5
|
||||
./bin/example_convnd_bwd_data_xdl 0 1 5
|
||||
```
|
||||
|
||||
Result
|
||||
|
||||
@@ -29,7 +29,7 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
static constexpr auto ConvBwdDefault =
|
||||
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default;
|
||||
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
|
||||
|
||||
using DeviceConvBwdDataBasePtr =
|
||||
ck::tensor_operation::device::DeviceConvBwdDataPtr<InElementOp, WeiElementOp, OutElementOp>;
|
||||
@@ -44,7 +44,7 @@ using DeviceConvNDBwdDataInstance = ck::tensor_operation::device::
|
||||
InElementOp, // InElementwiseOperation
|
||||
WeiElementOp, // WeiElementwiseOperation
|
||||
OutElementOp, // OutElementwiseOperation
|
||||
ConvBwdDefault, // ConvolutionBackwardDataSpecialization_t
|
||||
ConvBwdDefault, // ConvolutionBackwardDataSpecialization
|
||||
NumDimSpatial, // NumDimSpatial
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
|
||||
@@ -40,7 +40,7 @@ using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum;
|
||||
using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum;
|
||||
|
||||
static constexpr auto GemmSpecialization =
|
||||
ck::tensor_operation::device::GemmSpecialization_t::Default;
|
||||
ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
|
||||
// clang-format off
|
||||
using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatchedGemmReduce_Xdl_CShuffle
|
||||
|
||||
Reference in New Issue
Block a user