Merge branch 'develop' of https://github.com/ROCm/composable_kernel into wip-async-tr-fa

This commit is contained in:
aska-0096
2025-08-11 15:59:40 +00:00
7 changed files with 31 additions and 217 deletions

View File

@@ -1,16 +1,20 @@
ARG BASE_DOCKER="rocm/pytorch:latest"
FROM $BASE_DOCKER
RUN groupadd -f render && \
ARG AITER_BRANCH="main"
ARG CK_AITER_BRANCH="develop"
RUN groupadd -g 109 render && \
usermod -u 1001 jenkins && \
groupmod -g 1001 jenkins && \
pip install pandas zmq einops && \
pip install numpy==1.26.2 && \
sudo mkdir /home/jenkins && \
sudo mkdir /home/jenkins/workspace && \
cd /home/jenkins/workspace && \
rm -rf aiter && \
git clone --recursive https://github.com/ROCm/aiter.git && \
git clone -b "$AITER_BRANCH" --recursive https://github.com/ROCm/aiter.git && \
cd aiter && \
rm -rf 3rdparty/composable_kernel/ && \
git clone https://github.com/ROCm/composable_kernel.git 3rdparty/composable_kernel/ && \
git clone -b "$CK_AITER_BRANCH" https://github.com/ROCm/composable_kernel.git 3rdparty/composable_kernel/ && \
python3 setup.py develop && \
chown -R jenkins:jenkins /home/jenkins/workspace && \
chmod -R a+rwx /home/jenkins/workspace && \

164
Jenkinsfile vendored
View File

@@ -190,7 +190,7 @@ def buildDocker(install_prefix){
}
else if(params.RUN_AITER_TESTS){
image_name = "rocm/composable_kernel:ck_aiter"
dockerArgs = dockerArgs + " --no-cache -f Dockerfile.aiter . "
dockerArgs = dockerArgs + " --no-cache -f Dockerfile.aiter --build-arg AITER_BRANCH='${params.aiter_branch}' --build-arg CK_AITER_BRANCH='${params.ck_aiter_branch}' . "
}
else{
dockerArgs = dockerArgs + " -f Dockerfile . "
@@ -438,34 +438,6 @@ def cmake_build(Map conf=[:]){
echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
}
}
if (params.RUN_CK_TILE_TRANSPOSE_TESTS){
try{
archiveArtifacts "perf_transpose_*.log"
if (arch_type == 1){
stash includes: "perf_transpose_**_gfx90a.log", name: "perf_transpose_log_gfx90a"
}
else if (arch_type == 2){
stash includes: "perf_transpose_**_gfx942.log", name: "perf_transpose_log_gfx942"
}
}
catch(Exception err){
echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
}
}
if (params.RUN_CK_TILE_GEMM_TESTS){
try{
archiveArtifacts "perf_tile_gemm_**.log"
if (arch == 1){
stash includes: "perf_tile_gemm_**_gfx90a.log", name: "perf_tile_gemm_log_gfx90a"
}
else if (arch == 2){
stash includes: "perf_tile_gemm_**_gfx942.log", name: "perf_tile_gemm_log_gfx942"
}
}
catch(Exception err){
echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
}
}
}
def buildHipClangJob(Map conf=[:]){
@@ -762,24 +734,6 @@ def process_results(Map conf=[:]){
echo "could not locate the FMHA performance logs: ${err.getMessage()}."
}
}
if (params.RUN_CK_TILE_TRANSPOSE_TESTS){
try{
unstash "perf_transpose_log_gfx942"
unstash "perf_transpose_log_gfx90a"
}
catch(Exception err){
echo "could not locate the Transpose performance logs: ${err.getMessage()}."
}
}
if (params.RUN_CK_TILE_GEMM_TESTS){
try{
unstash "perf_tile_gemm_log_gfx942"
unstash "perf_tile_gemm_log_gfx90a"
}
catch(Exception err){
echo "could not locate the GEMM performance logs: ${err.getMessage()}."
}
}
if (params.RUN_FULL_QA || params.BUILD_INSTANCES_ONLY){
// unstash deb packages
unstash "packages"
@@ -843,10 +797,10 @@ def run_aiter_tests(Map conf=[:]){
withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 45, unit: 'MINUTES'){
try{
sh "python3 --version"
sh "rocminfo"
sh "python3 ../aiter/op_tests/test_gemm_a8w8_blockscale.py"
//sh "python3 ../aiter/op_tests/test_mha.py"
sh "python3 --version"
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py"
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py"
}
catch(e){
echo "Throwing error exception while running AITER tests"
@@ -861,7 +815,7 @@ def run_aiter_tests(Map conf=[:]){
}
//launch develop branch daily jobs
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
@@ -941,14 +895,6 @@ pipeline {
name: "RUN_CK_TILE_FMHA_TESTS",
defaultValue: false,
description: "Run the ck_tile FMHA tests (default: OFF)")
booleanParam(
name: "RUN_CK_TILE_TRANSPOSE_TESTS",
defaultValue: false,
description: "Run the ck_tile Transpose tests (default: OFF)")
booleanParam(
name: "RUN_CK_TILE_GEMM_TESTS",
defaultValue: false,
description: "Run the ck_tile GEMM tests (default: OFF)")
booleanParam(
name: "RUN_TILE_ENGINE_GEMM_TESTS",
defaultValue: false,
@@ -1009,6 +955,14 @@ pipeline {
name: "RUN_AITER_TESTS",
defaultValue: false,
description: "Run AITER tests with latest CK develop branch (default: OFF)")
string(
name: 'aiter_branch',
defaultValue: 'main',
description: 'Specify which branch of AITER to use (default: main)')
string(
name: 'ck_aiter_branch',
defaultValue: 'develop',
description: 'Specify which branch of CK to test with AITER (default: develop)')
}
environment{
dbuser = "${dbuser}"
@@ -1093,13 +1047,13 @@ pipeline {
{
parallel
{
stage("Run AITER Tests on gfx90a")
stage("Run AITER Tests on gfx942")
{
when {
beforeAgent true
expression { params.RUN_AITER_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a")}
agent{ label rocmnode("gfx942")}
steps{
run_aiter_tests()
cleanWs()
@@ -1198,94 +1152,6 @@ pipeline {
}
}
}
stage("Run CK_TILE_TRANSPOSE Tests")
{
parallel
{
stage("Run CK_TILE_TRANSPOSE Tests on gfx90a")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 tile_example_batched_transpose && \
cd ../ &&
example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
stage("Run CK_TILE_TRANSPOSE Tests on gfx942")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
make -j64 tile_example_batched_transpose && \
cd ../ &&
example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Run CK_TILE_GEMM Tests")
{
parallel
{
stage("Run CK_TILE_GEMM Tests on gfx90a")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 tile_example_gemm_universal && \
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
stage("Run CK_TILE_GEMM Tests on gfx942")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
make -j64 tile_example_gemm_universal && \
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Run TILE_ENGINE_GEMM Tests")
{
parallel

View File

@@ -96,7 +96,7 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
4. Build the entire CK library:
```bash
make -j
make -j"$(nproc)"
```
5. Install CK:
@@ -213,4 +213,4 @@ script/uninstall_precommit.sh
```
If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the
`git commit` command.
`git commit` command.

View File

@@ -35,8 +35,6 @@ struct Add
return type_convert<T>(y_ + x_);
}
static constexpr bool requires_special_combine = false;
};
struct SquareAdd
@@ -64,28 +62,6 @@ struct SquareAdd
float x_ = type_convert<float>(x);
return type_convert<T>(y_ + (x_ * x_));
}
// For combining partial results
template <typename T,
typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
CK_TILE_HOST_DEVICE constexpr T combine_partial_results(const T& partial1,
const T& partial2) const
{
return partial1 + partial2; // Just add the partial sums, don't square again
}
template <typename T,
typename = std::enable_if_t<std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
CK_TILE_HOST_DEVICE constexpr T combine_partial_results(T& partial1, T& partial2) const
{
float partial1_ = type_convert<float>(partial1);
float partial2_ = type_convert<float>(partial2);
return type_convert<T>(partial1_ + partial2_);
}
static constexpr bool requires_special_combine = true;
};
struct Max
@@ -109,8 +85,6 @@ struct Max
{
return max(y, x);
}
static constexpr bool requires_special_combine = false;
};
struct AbsMax
@@ -134,8 +108,6 @@ struct AbsMax
{
return max(y, abs(x));
}
static constexpr bool requires_special_combine = false;
};
} // namespace ReduceOp

View File

@@ -183,16 +183,7 @@ struct BlockReduce2dSync
// pull data from remote lane
const auto v_remote = warp_shuffle(v_local, src_lane);
// For reduce, use combine_partial_results for operations that require it
if constexpr(ReduceFunc::requires_special_combine)
{
v_local = reduce_func.combine_partial_results(v_local, v_remote);
}
else
{
v_local = reduce_func(v_local, v_remote);
}
v_local = reduce_func(v_local, v_remote);
});
}
});
@@ -309,16 +300,7 @@ struct BlockReduce2dCrossWarpSync
static_for<0, num_reduce_warps - 1, 1>{}([&](auto i_1_n1) {
constexpr auto i_1 = number<i_1_n1 + 1>{};
const DataType v_remote = all_scratch[i_0 * num_reduce_warps + i_1];
// For reduce, use combine_partial_results for operations that require it
if constexpr(ReduceFunc::requires_special_combine)
{
v_local = reduce_func.combine_partial_results(v_local, v_remote);
}
else
{
v_local = reduce_func(v_local, v_remote);
}
v_local = reduce_func(v_local, v_remote);
});
y_tensor.get_thread_buffer()(i_0) = v_local;

View File

@@ -189,7 +189,9 @@ struct Reduce
/// @note Requirements:
/// - y_continous_dim % ThreadTile_N == 0 (for proper thread distribution)
/// - input_strides[-1] == 1 (for contiguous memory access)
CK_TILE_HOST static bool IsSupportedArgument(index_t y_continous_dim, auto input_strides)
template <typename InputStrides>
CK_TILE_HOST static bool IsSupportedArgument(index_t y_continous_dim,
InputStrides input_strides)
{
using S = typename Problem::BlockShape;

View File

@@ -308,20 +308,8 @@ using TestConfig_F32_Max = std::tuple<float,
Shape1_WarpTile,
Shape1_ThreadTile>;
using TestConfig_F32_SquareAdd = std::tuple<float,
float,
float,
ck_tile::ReduceOp::SquareAdd,
Shape1_BlockWarps,
Shape1_BlockTile,
Shape1_WarpTile,
Shape1_ThreadTile>;
using TestTypes = ::testing::Types<TestConfig_F32_Add,
TestConfig_F16_Add,
TestConfig_F32_CrossWarp,
TestConfig_F32_Max,
TestConfig_F32_SquareAdd>;
using TestTypes = ::testing::
Types<TestConfig_F32_Add, TestConfig_F16_Add, TestConfig_F32_CrossWarp, TestConfig_F32_Max>;
TYPED_TEST_SUITE(TestCkTileReduce, TestTypes);