[CK_TILE] Change output accum tensor layout of fmha fwd split-kv & combine kernels (#1527)

* Use same layout for o_acc and o tensor

* Use better param names in partitioner

* Remove redundant kargs 'max_seqlen_q'

* Use better param names in splitkv kernel

* Add comment for additional kernel arguments

* Sync empty loop early return logics between pipelines

* Pass more arguments to cmake in scripts

* Align backslashes

* Fix wrong o_acc tensor view strides

* Change o_acc layout if o_perm=0

* Handle whole row masked via attn_bias

* Use use vector width = 1 for o_acc

* Use more even split sizes

[ROCm/composable_kernel commit: a1c07e8d91]
This commit is contained in:
Po Yen Chen
2024-10-01 22:13:52 +08:00
committed by GitHub
parent cdf713b239
commit beea8fa1f7
11 changed files with 104 additions and 97 deletions

View File

@@ -7,8 +7,10 @@ MY_PROJECT_SOURCE=$1
if [ $# -ge 2 ] ; then
GPU_TARGETS=$2
REST_ARGS=${@:3}
else
GPU_TARGETS="gfx908;gfx90a;gfx940"
REST_ARGS=
fi
cmake \
@@ -20,4 +22,5 @@ cmake
-D GPU_TARGETS=$GPU_TARGETS \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \
$REST_ARGS \
${MY_PROJECT_SOURCE}

View File

@@ -7,8 +7,10 @@ MY_PROJECT_SOURCE=$1
if [ $# -ge 2 ] ; then
GPU_TARGETS=$2
REST_ARGS=${@:3}
else
GPU_TARGETS="gfx908;gfx90a;gfx940"
REST_ARGS=
fi
cmake \
@@ -20,5 +22,6 @@ cmake
-D GPU_TARGETS=$GPU_TARGETS \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \
$REST_ARGS \
${MY_PROJECT_SOURCE}