Merge commit 'f38751fc2aa0f84bca7eab7ff4a588ae9cf16a24' into develop

This commit is contained in:
assistant-librarian[bot]
2025-08-19 08:14:30 +00:00
parent b20efcc8c0
commit e9577d7ef3
23 changed files with 48 additions and 79 deletions

View File

@@ -7,7 +7,7 @@ This folder contains example for fmha(fused multi-head attention) using ck_tile
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
../script/cmake-ck-dev.sh ../ <arch>
make tile_example_fmha_fwd -j
```
This will result in an executable `build/bin/tile_example_fmha_fwd`

View File

@@ -42,7 +42,7 @@ return hidden_states, per_token_scale
```
# in the root of ck_tile
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
make tile_example_layernorm2d_fwd -j
```
This will result in an executable `build/bin/tile_example_layernorm2d_fwd`

View File

@@ -7,7 +7,7 @@ This folder contains example for GEMM using ck_tile tile-programming implementat
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
../script/cmake-ck-dev.sh ../ <arch>
# The basic pipeline method on the gemm calculation
make tile_example_gemm_basic -j
# The memory bound pipeline on the gemm calculation

View File

@@ -7,7 +7,7 @@ This folder contains example for Image to Column using ck_tile tile-programming
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
../script/cmake-ck-dev.sh ../ <arch>
make tile_example_img2col -j
```
This will result in an executable `build/bin/tile_example_img2col`

View File

@@ -15,7 +15,7 @@ args:
```
# in the root of ck_tile
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
make tile_example_permute -j
```
This will result in an executable `build/bin/tile_example_permute`

View File

@@ -6,7 +6,7 @@ This folder contains example for topk-softmax kernel using ck_tile tile-programm
```
# in the root of ck_tile
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
make tile_example_topk_softmax -j
```
This will result in an executable `build/bin/tile_example_topk_softmax`

View File

@@ -6,7 +6,7 @@ This folder contains example for Rmsnorm2D forward using ck_tile tile-programmin
```
# in the root of ck_tile
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
make tile_rmsnorm2d_fwd -j
```
This will result in an executable `build/bin/tile_rmsnorm2d_fwd`

View File

@@ -6,7 +6,7 @@ This folder contains example for add + Rmsnorm2D + rowwise dynamic quantization
```
# in the root of ck_tile
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
make tile_add_rmsnorm2d_rdquant_fwd -j
```
This will result in an executable `build/bin/tile_add_rmsnorm2d_rdquant_fwd`

View File

@@ -6,7 +6,7 @@ This folder contains example for smoothquant using ck_tile tile-programming impl
```
# in the root of ck_tile
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
make tile_smoothquant -j
```
This will result in an executable `build/bin/tile_smoothquant`

View File

@@ -6,7 +6,7 @@ This folder contains example for moe-sorting kernel using ck_tile tile-programmi
```
# in the root of ck_tile
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
make tile_example_moe_sorting -j
```
This will result in an executable `build/bin/tile_example_moe_sorting`

View File

@@ -9,7 +9,7 @@ Unlike standard smoothquant op, the input scale is from different expert `[exper
```
# in the root of ck_tile
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
make tile_example_moe_smoothquant -j
```
This will result in an executable `build/bin/tile_example_moe_smoothquant`

View File

@@ -7,7 +7,7 @@ This folder contains example for batched GEMM using ck_tile tile-programming imp
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
../script/cmake-ck-dev.sh ../ <arch>
make tile_example_batched_gemm -j
```
This will result in an executable `build/bin/tile_example_batched_gemm`

View File

@@ -148,7 +148,7 @@ All the necessary parameters are set, the tiling is computed, the GEMM pipeline
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
../script/cmake-ck-dev.sh ../ <arch>
# The basic pipeline method on the gemm calculation
make tile_example_grouped_gemm -j
```

View File

@@ -7,7 +7,7 @@ This folder contains example for FLATMM using ck_tile tile-programming implement
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
../script/cmake-ck-dev.sh ../ <arch>
# The basic pipeline method on the flatmm calculation
make tile_example_flatmm_basic -j
```

View File

@@ -8,7 +8,7 @@ This folder contains example for Multiple D GEMM using ck_tile tile-programming
mkdir build && cd build
# you can replace <arch> with the appropriate architecture
# (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
../script/cmake-ck-dev.sh ../ <arch>
# The basic pipeline method on the gemm calculation
make tile_example_gemm_multi_d_fp16 -j
```

View File

@@ -6,7 +6,7 @@ This folder contains example for batched Transpose using ck_tile tile-programmin
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
../script/cmake-ck-dev.sh ../ <arch>
# Make the transpose executable
make tile_example_batched_transpose -j
```

View File

@@ -7,7 +7,7 @@ This folder contains example for Block Scale GEMM using ck_tile tile-programming
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
../script/cmake-ck-dev.sh ../ <arch>
# The aquant pipeline method on the gemm calculation
make tile_example_gemm_aquant_basic -j
```

View File

@@ -12,7 +12,7 @@ This experimental kernel is intended for novice CK developers. It introduces the
mkdir build && cd build
# you can replace <arch> with the appropriate architecture
# (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
../script/cmake-ck-dev.sh ../ <arch>
# Make the copy kernel executable
make tile_example_copy -j
```

View File

@@ -1,44 +1,47 @@
#!/bin/bash
# Configures a Release build with BUILD_DEV=ON by invoking cmake from the
# current directory. Positional arguments: the project source directory, an
# optional GPU target list beginning with "gfx", then extra cmake arguments.
#
# NOTE(review): this span is a rendered diff hunk (@@ -1,44 +1,47 @@). The
# pre-change and post-change lines of the script are interleaved without +/-
# markers, so statements appear twice in old and new form, and the
# `if [ $# -ge 2 ]` / `case "$2"` opened below are never closed in this text.
# It is not runnable as shown.
set -euo pipefail
IFS=$'\n\t'
# exit when a command exits with non-zero status; also when an unbound variable is referenced
set -eu
# pipefail is supported by many shells, not supported by sh and dash
# NOTE(review): `set` is the left-hand side of a pipeline here, and bash runs
# each pipeline command in its own subshell, so this does not appear to turn
# pipefail on for the script itself - confirm, and consider probing with
# `(set -o pipefail) 2>/dev/null && set -o pipefail` instead.
set -o pipefail 2>/dev/null | true
# when treating a string as a sequence, do not split on spaces
# (command substitution strips only trailing newlines; the tab is placed last
# so the newline is kept in the value)
IFS=$(printf '\n\t')
rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
# clean the build system files
find . -name CMakeFiles -type d -exec rm -rfv {} +
find . -name CMakeCache.txt -type f -exec rm -rv {} +
# First positional argument: project source directory. The guarded form
# falls back to ".." when no argument is given.
MY_PROJECT_SOURCE=$1
if [ $# -ge 1 ]; then
MY_PROJECT_SOURCE="$1"
shift 1
else
MY_PROJECT_SOURCE=".."
fi
# Default GPU target list, used unless the next argument starts with "gfx".
GPU_TARGETS="gfx908;gfx90a;gfx942"
if [ $# -ge 2 ]; then
case "$2" in
gfx*)
GPU_TARGETS=$2
shift 2
if [ $# -ge 1 ]; then
case "$1" in
gfx*)
GPU_TARGETS=$1
shift 1
echo "GPU targets provided: $GPU_TARGETS"
REST_ARGS=$@
;;
*)
echo "No GPU targets provided, using default targets: gfx908;gfx90a;gfx942"
GPU_TARGETS="gfx908;gfx90a;gfx942"
shift 1
REST_ARGS=$@
echo "No GPU targets provided, using default targets: $GPU_TARGETS"
;;
esac
else
echo "No GPU targets provided, using default targets: gfx908;gfx90a;gfx942"
GPU_TARGETS="gfx908;gfx90a;gfx942"
shift 1
REST_ARGS=$@
echo "No GPU targets provided, using default targets: $GPU_TARGETS"
fi
# Run the configure step; whatever positional arguments remain are forwarded
# to cmake ahead of the source directory.
# NOTE(review): the expansions below are unquoted. With IFS set to
# newline/tab they are not split on spaces, but they are still subject to
# pathname expansion.
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm/ \
-D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
-D CMAKE_CXX_FLAGS="-std=c++20 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
-D CMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
-D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=ON \
-D GPU_TARGETS=$GPU_TARGETS \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \
$REST_ARGS \
$@ \
${MY_PROJECT_SOURCE}

View File

@@ -1,34 +0,0 @@
#!/bin/bash
# Configure a Release build (BUILD_DEV=OFF) with CMake in the current directory.
#
# Usage: <script> [source_dir] [gfx_targets] [extra cmake args...]
#   source_dir   path to the project source tree; defaults to ".."
#   gfx_targets  optional; used only when it starts with "gfx"
#                (for example "gfx942" or "gfx90a;gfx942"), otherwise the
#                default list "gfx908;gfx90a;gfx942" is used
#   extra args   forwarded to cmake unchanged, one cmake argument per word
set -euo pipefail
# Split unquoted expansions on newline/tab only, never on spaces.
IFS=$'\n\t'

# Remove stale configuration output from the current (build) directory.
rm -f CMakeCache.txt
rm -f -- *.cmake
rm -rf CMakeFiles

# Under `set -u` a bare $1 aborts with "unbound variable" when the script is
# called without arguments, so take it only when present.
if [ $# -ge 1 ]; then
    MY_PROJECT_SOURCE=$1
    shift
else
    MY_PROJECT_SOURCE=".."
fi

GPU_TARGETS="gfx908;gfx90a;gfx942"
if [ $# -ge 1 ] && [[ "$1" =~ ^gfx ]]; then
    GPU_TARGETS=$1
    shift
    echo "GPU targets provided: $GPU_TARGETS"
else
    echo "No GPU targets provided, using default targets: $GPU_TARGETS"
fi

# Forward the remaining arguments with "$@". Copying them into a scalar
# (REST_ARGS=$@) joins them with spaces, and because IFS no longer contains a
# space the unquoted expansion would hand cmake all of them as ONE argument.
cmake \
    -D CMAKE_PREFIX_PATH=/opt/rocm \
    -D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
    -D CMAKE_CXX_FLAGS="-O3" \
    -D CMAKE_BUILD_TYPE=Release \
    -D BUILD_DEV=OFF \
    -D GPU_TARGETS="$GPU_TARGETS" \
    -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
    -D USE_BITINT_EXTENSION_INT4=OFF \
    "$@" \
    "${MY_PROJECT_SOURCE}"

View File

@@ -12,7 +12,7 @@ is moved to output DRAM window for a simple copy operation.
mkdir build && cd build
# you can replace <arch> with the appropriate architecture
# (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
../script/cmake-ck-dev.sh ../ <arch>
# Make the copy kernel executable
make test_copy -j
```

View File

@@ -20,7 +20,7 @@ mkdir build && cd build
# replace [Arch] with the appropriate architecture or leave blank and
# replace [Datatype1;Datatype2;...] with a semicolon-separated string of datatypes (possible datatypes are [fp8, bf8, int8, fp16, bf16])
# replace [Layout1;Layout2;...] with a semicolon-separated string of layouts (possible layouts are [rcr, rrr, crr, ccr])
sh ../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]"
../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]"
# generate different executable for each passed datatype
make benchmark_gemm_[Datatype1]_[Layout1] -j
make benchmark_gemm_[Datatype1]_[Layout2] -j
@@ -38,7 +38,7 @@ rm -rf tile_engine/ && make benchmark_gemm_[Datatypes]_[Layout] -j # rebuild
## For example build for gfx942 for fp8 and fp16 datatypes with rcr layout
``` bash
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr"
../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr"
make benchmark_gemm_fp8_rcr -j
make benchmark_gemm_fp16_rcr -j
```

View File

@@ -21,7 +21,7 @@ mkdir build && cd build
# replace [Datatype] in comma separated datatypes string (possible datatypes are [fp16])
# replace [Layout1;Layout2;...] with a semicolon-separated string of layouts (possible layouts are [rcr, rrr, crr, ccr])
# replace "mul" with either of mul,add,passthrough for Elementwise function as Multiply, Add or Passthrough respectively. If this is not specified it is considered as mul by default.
sh ../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_MULTI_D_DATATYPE="[Datatype]" -DGEMM_MULTI_D_LAYOUT="[Layout1;Layout2]" -DGEMM_MULTI_D_ELEMENTWISE_FUNCTION="mul"
../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_MULTI_D_DATATYPE="[Datatype]" -DGEMM_MULTI_D_LAYOUT="[Layout1;Layout2]" -DGEMM_MULTI_D_ELEMENTWISE_FUNCTION="mul"
# generate different executable for each passed datatype
make benchmark_gemm_multi_d_[Datatype]_[Layout1] -j
make benchmark_gemm_multi_d_[Datatype]_[Layout2] -j
@@ -37,7 +37,7 @@ rm -rf tile_engine/ && make benchmark_gemm_multi_d_[Datatype]_[Layout] -j # reb
## For example build for gfx942 for datatype with rcr layout
``` bash
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_MULTI_D_DATATYPE="fp16" -DGEMM_MULTI_D_LAYOUT="rcrr"
../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_MULTI_D_DATATYPE="fp16" -DGEMM_MULTI_D_LAYOUT="rcrr"
make benchmark_gemm_multi_d_fp16_rcrr -j
## benchmark_gemm inputs