Compile for gfx908 and gfx90a (#130)

* adding compilation for multiple targets * fix build * clean * update Jenkinsfile * update readme * update Jenkins * use ck::half_t instead of ushort for bf16 * rename enum classes * clean * rename * clean
2026-04-20 06:49:15 +00:00 · 2022-03-31 12:33:34 -05:00
parent ecf337bab5
commit cd167e492a
227 changed files with 1398 additions and 2944 deletions
--- a/example/13_pool2d_fwd/README.md
+++ b/example/13_pool2d_fwd/README.md
@@ -1,45 +1,12 @@
-# Instructions for ```pool2d_fwd``` Example
+# Instructions for ```example_pool2d_fwd``` Example

-## Docker script
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-
-## Build ```pool2d_fwd```
-```bash
-mkdir build && cd build
-```
-
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-
-```bash
- make -j pool2d_fwd
-```
-
-## Run ```pool2d_fwd```
+## Run ```example_pool2d_fwd```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
-./example/pool2d_fwd 1 1 10
+./bin/example_pool2d_fwd 1 1 10
 ```

 Result 
--- a/example/13_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd.cpp
@@ -22,9 +22,9 @@ using InLayout  = ck::tensor_layout::convolution::NHWC;
 using OutLayout = ck::tensor_layout::convolution::NHWC;

 #if 1
-static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::MAX;
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
 #else
-static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::AVG;
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
 #endif

 static constexpr bool NeedIndices  = false;
@@ -47,7 +47,7 @@ using DevicePoolFwdInstance =
 template <typename InDataType,
          typename OutDataType,
          typename AccDataType,
-          ck::ReduceTensorOp_t ReduceOpId,
+          ck::ReduceTensorOp ReduceOpId,
          bool PropagateNan,
          bool NeedIndices>
 static void pool_host_verify(const Tensor<InDataType>& in,