mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 02:54:21 +00:00
Merge commit 'f38751fc2aa0f84bca7eab7ff4a588ae9cf16a24' into develop
This commit is contained in:
@@ -7,7 +7,7 @@ This folder contains example for fmha(fused multi-head attention) using ck_tile
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch>
|
||||
../script/cmake-ck-dev.sh ../ <arch>
|
||||
make tile_example_fmha_fwd -j
|
||||
```
|
||||
This will result in an executable `build/bin/tile_example_fmha_fwd`
|
||||
|
||||
@@ -42,7 +42,7 @@ return hidden_states, per_token_scale
|
||||
```
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
make tile_example_layernorm2d_fwd -j
|
||||
```
|
||||
This will result in an executable `build/bin/tile_example_layernorm2d_fwd`
|
||||
|
||||
@@ -7,7 +7,7 @@ This folder contains example for GEMM using ck_tile tile-programming implementat
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch>
|
||||
../script/cmake-ck-dev.sh ../ <arch>
|
||||
# The basic pipeline method on the gemm calculation
|
||||
make tile_example_gemm_basic -j
|
||||
# The memory bound pipeline on the gemm calculation
|
||||
|
||||
@@ -7,7 +7,7 @@ This folder contains example for Image to Column using ck_tile tile-programming
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch>
|
||||
../script/cmake-ck-dev.sh ../ <arch>
|
||||
make tile_example_img2col -j
|
||||
```
|
||||
This will result in an executable `build/bin/tile_example_img2col`
|
||||
|
||||
@@ -15,7 +15,7 @@ args:
|
||||
```
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
make tile_example_permute -j
|
||||
```
|
||||
This will result in an executable `build/bin/tile_example_permute`
|
||||
|
||||
@@ -6,7 +6,7 @@ This folder contains example for topk-softmax kernel using ck_tile tile-programm
|
||||
```
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
make tile_example_topk_softmax -j
|
||||
```
|
||||
This will result in an executable `build/bin/tile_example_topk_softmax`
|
||||
|
||||
@@ -6,7 +6,7 @@ This folder contains example for Rmsnorm2D forward using ck_tile tile-programmin
|
||||
```
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
make tile_rmsnorm2d_fwd -j
|
||||
```
|
||||
This will result in an executable `build/bin/tile_rmsnorm2d_fwd`
|
||||
|
||||
@@ -6,7 +6,7 @@ This folder contains example for add + Rmsnorm2D + rowwise dynamic quantization
|
||||
```
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
make tile_add_rmsnorm2d_rdquant_fwd -j
|
||||
```
|
||||
This will result in an executable `build/bin/tile_add_rmsnorm2d_rdquant_fwd`
|
||||
|
||||
@@ -6,7 +6,7 @@ This folder contains example for smoothquant using ck_tile tile-programming impl
|
||||
```
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
make tile_smoothquant -j
|
||||
```
|
||||
This will result in an executable `build/bin/tile_smoothquant`
|
||||
|
||||
@@ -6,7 +6,7 @@ This folder contains example for moe-sorting kernel using ck_tile tile-programmi
|
||||
```
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
make tile_example_moe_sorting -j
|
||||
```
|
||||
This will result in an executable `build/bin/tile_example_moe_sorting`
|
||||
|
||||
@@ -9,7 +9,7 @@ Unlike standard smoothquant op, the input scale is from different expert `[exper
|
||||
```
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
make tile_example_moe_smoothquant -j
|
||||
```
|
||||
This will result in an executable `build/bin/tile_example_moe_smoothquant`
|
||||
|
||||
@@ -7,7 +7,7 @@ This folder contains example for batched GEMM using ck_tile tile-programming imp
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch>
|
||||
../script/cmake-ck-dev.sh ../ <arch>
|
||||
make tile_example_batched_gemm -j
|
||||
```
|
||||
This will result in an executable `build/bin/tile_example_batched_gemm`
|
||||
|
||||
@@ -148,7 +148,7 @@ All the necessary parameters are set, the tiling is computed, the GEMM pipeline
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch>
|
||||
../script/cmake-ck-dev.sh ../ <arch>
|
||||
# The basic pipeline method on the gemm calculation
|
||||
make tile_example_grouped_gemm -j
|
||||
```
|
||||
|
||||
@@ -7,7 +7,7 @@ This folder contains example for FLATMM using ck_tile tile-programming implement
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch>
|
||||
../script/cmake-ck-dev.sh ../ <arch>
|
||||
# The basic pipeline method on the flatmm calculation
|
||||
make tile_example_flatmm_basic -j
|
||||
```
|
||||
|
||||
@@ -8,7 +8,7 @@ This folder contains example for Multiple D GEMM using ck_tile tile-programming
|
||||
mkdir build && cd build
|
||||
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942)
|
||||
# or leave it blank
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch>
|
||||
../script/cmake-ck-dev.sh ../ <arch>
|
||||
# The basic pipeline method on the gemm calculation
|
||||
make tile_example_gemm_multi_d_fp16 -j
|
||||
```
|
||||
|
||||
@@ -6,7 +6,7 @@ This folder contains example for batched Transpose using ck_tile tile-programmin
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch>
|
||||
../script/cmake-ck-dev.sh ../ <arch>
|
||||
# Make the transpose executable
|
||||
make tile_example_batched_transpose -j
|
||||
```
|
||||
|
||||
@@ -7,7 +7,7 @@ This folder contains example for Block Scale GEMM using ck_tile tile-programming
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch>
|
||||
../script/cmake-ck-dev.sh ../ <arch>
|
||||
# The aquant pipeline method on the gemm calculation
|
||||
make tile_example_gemm_aquant_basic -j
|
||||
```
|
||||
|
||||
@@ -12,7 +12,7 @@ This experimental kernel is intended for novice CK developers. It introduces the
|
||||
mkdir build && cd build
|
||||
# you can replace <arch> with the appropriate architecture
|
||||
# (for example gfx90a or gfx942) or leave it blank
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch>
|
||||
../script/cmake-ck-dev.sh ../ <arch>
|
||||
# Make the copy kernel executable
|
||||
make tile_example_copy -j
|
||||
```
|
||||
|
||||
@@ -1,44 +1,47 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
IFS=$'\n\t'
|
||||
# exit when a command exits with non-zero status; also when an unbound variable is referenced
|
||||
set -eu
|
||||
# pipefail is supported by many shells, not supported by sh and dash
|
||||
set -o pipefail 2>/dev/null | true
|
||||
# when treating a string as a sequence, do not split on spaces
|
||||
IFS=$(printf '\n\t')
|
||||
|
||||
rm -f CMakeCache.txt
|
||||
rm -f *.cmake
|
||||
rm -rf CMakeFiles
|
||||
# clean the build system files
|
||||
find . -name CMakeFiles -type d -exec rm -rfv {} +
|
||||
find . -name CMakeCache.txt -type f -exec rm -rv {} +
|
||||
|
||||
MY_PROJECT_SOURCE=$1
|
||||
if [ $# -ge 1 ]; then
|
||||
MY_PROJECT_SOURCE="$1"
|
||||
shift 1
|
||||
else
|
||||
MY_PROJECT_SOURCE=".."
|
||||
fi
|
||||
|
||||
GPU_TARGETS="gfx908;gfx90a;gfx942"
|
||||
|
||||
if [ $# -ge 2 ]; then
|
||||
case "$2" in
|
||||
gfx*)
|
||||
GPU_TARGETS=$2
|
||||
shift 2
|
||||
if [ $# -ge 1 ]; then
|
||||
case "$1" in
|
||||
gfx*)
|
||||
GPU_TARGETS=$1
|
||||
shift 1
|
||||
echo "GPU targets provided: $GPU_TARGETS"
|
||||
REST_ARGS=$@
|
||||
;;
|
||||
*)
|
||||
echo "No GPU targets provided, using default targets: gfx908;gfx90a;gfx942"
|
||||
GPU_TARGETS="gfx908;gfx90a;gfx942"
|
||||
shift 1
|
||||
REST_ARGS=$@
|
||||
echo "No GPU targets provided, using default targets: $GPU_TARGETS"
|
||||
;;
|
||||
esac
|
||||
else
|
||||
echo "No GPU targets provided, using default targets: gfx908;gfx90a;gfx942"
|
||||
GPU_TARGETS="gfx908;gfx90a;gfx942"
|
||||
shift 1
|
||||
REST_ARGS=$@
|
||||
echo "No GPU targets provided, using default targets: $GPU_TARGETS"
|
||||
fi
|
||||
|
||||
cmake \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm/ \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
|
||||
-D CMAKE_CXX_FLAGS="-std=c++20 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
|
||||
-D CMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D BUILD_DEV=ON \
|
||||
-D GPU_TARGETS=$GPU_TARGETS \
|
||||
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
|
||||
-D USE_BITINT_EXTENSION_INT4=OFF \
|
||||
$REST_ARGS \
|
||||
$@ \
|
||||
${MY_PROJECT_SOURCE}
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
IFS=$'\n\t'
|
||||
|
||||
rm -f CMakeCache.txt
|
||||
rm -f *.cmake
|
||||
rm -rf CMakeFiles
|
||||
|
||||
MY_PROJECT_SOURCE=$1
|
||||
|
||||
if [ $# -ge 2 ] && [[ "$2" =~ ^gfx ]]; then
|
||||
GPU_TARGETS=$2
|
||||
shift 2
|
||||
echo "GPU targets provided: $GPU_TARGETS"
|
||||
REST_ARGS=$@
|
||||
else
|
||||
echo "No GPU targets provided, using default targets: gfx908;gfx90a;gfx942"
|
||||
GPU_TARGETS="gfx908;gfx90a;gfx942"
|
||||
shift 1
|
||||
REST_ARGS=$@
|
||||
fi
|
||||
|
||||
cmake \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
|
||||
-D CMAKE_CXX_FLAGS="-O3" \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D GPU_TARGETS=$GPU_TARGETS \
|
||||
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
|
||||
-D USE_BITINT_EXTENSION_INT4=OFF \
|
||||
$REST_ARGS \
|
||||
${MY_PROJECT_SOURCE}
|
||||
|
||||
@@ -12,7 +12,7 @@ is moved to output DRAM window for a simple copy operation.
|
||||
mkdir build && cd build
|
||||
# you can replace <arch> with the appropriate architecture
|
||||
# (for example gfx90a or gfx942) or leave it blank
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch>
|
||||
../script/cmake-ck-dev.sh ../ <arch>
|
||||
# Make the copy kernel executable
|
||||
make test_copy -j
|
||||
```
|
||||
|
||||
@@ -20,7 +20,7 @@ mkdir build && cd build
|
||||
# replace [Arch] with the appropriate architecture or leave blank and
|
||||
# replace [Datatype1;Datatype2;...] with a semicolon-separated datatypes string (possible datatypes are [fp8, bf8, int8, fp16, bf16])
|
||||
# replace [Layout1;Layout2;...] with a semicolon-separated layouts string (possible layouts are [rcr, rrr, crr, ccr])
|
||||
sh ../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]"
|
||||
../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]"
|
||||
# generate different executable for each passed datatype
|
||||
make benchmark_gemm_[Datatype1]_[Layout1] -j
|
||||
make benchmark_gemm_[Datatype1]_[Layout2] -j
|
||||
@@ -38,7 +38,7 @@ rm -rf tile_engine/ && make benchmark_gemm_[Datatypes]_[Layout] -j # rebuild
|
||||
## For example, build for gfx942 with fp8 and fp16 datatypes and rcr layout
|
||||
``` bash
|
||||
mkdir build && cd build
|
||||
sh ../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr"
|
||||
../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr"
|
||||
make benchmark_gemm_fp8_rcr -j
|
||||
make benchmark_gemm_fp16_rcr -j
|
||||
```
|
||||
|
||||
@@ -21,7 +21,7 @@ mkdir build && cd build
|
||||
# replace [Datatype] in comma separated datatypes string (possible datatypes are [fp16])
|
||||
# replace [Layout1;Layout2;...] with a semicolon-separated layouts string (possible layouts are [rcr, rrr, crr, ccr])
|
||||
# replace "mul" with one of mul, add, or passthrough to select the elementwise function (Multiply, Add, or Passthrough respectively). If not specified, mul is used by default.
|
||||
sh ../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_MULTI_D_DATATYPE="[Datatype]" -DGEMM_MULTI_D_LAYOUT="[Layout1;Layout2]" -DGEMM_MULTI_D_ELEMENTWISE_FUNCTION="mul"
|
||||
../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_MULTI_D_DATATYPE="[Datatype]" -DGEMM_MULTI_D_LAYOUT="[Layout1;Layout2]" -DGEMM_MULTI_D_ELEMENTWISE_FUNCTION="mul"
|
||||
# generate different executable for each passed datatype
|
||||
make benchmark_gemm_multi_d_[Datatype]_[Layout1] -j
|
||||
make benchmark_gemm_multi_d_[Datatype]_[Layout2] -j
|
||||
@@ -37,7 +37,7 @@ rm -rf tile_engine/ && make benchmark_gemm_multi_d_[Datatype]_[Layout] -j # reb
|
||||
## For example, build for gfx942 for the fp16 datatype with rcrr layout
|
||||
``` bash
|
||||
mkdir build && cd build
|
||||
sh ../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_MULTI_D_DATATYPE="fp16" -DGEMM_MULTI_D_LAYOUT="rcrr"
|
||||
../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_MULTI_D_DATATYPE="fp16" -DGEMM_MULTI_D_LAYOUT="rcrr"
|
||||
make benchmark_gemm_multi_d_fp16_rcrr -j
|
||||
|
||||
## benchmark_gemm inputs
|
||||
|
||||
Reference in New Issue
Block a user