Merge commit '8fe3838c65ab4c290423ff0e952e882c19e2c60d' into develop

2026-05-14 10:09:41 +00:00 · 2025-09-24 17:12:28 +00:00
parent 95324c306e
commit bef885dc89
15 changed files with 50 additions and 47 deletions
--- a/29
+++ b/29
@@ -1,27 +1,23 @@
+
 FROM ubuntu:24.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=6.4.1
+ARG ROCMVERSION=7.0.1
 ARG compiler_version=""
 ARG compiler_commit=""
 ARG CK_SCCACHE=""
 ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
 ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
+ENV DEBIAN_FRONTEND=noninteractive

 # Add rocm repository
 RUN set -xe && \
-    apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
-    curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
+    apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl

-RUN if [ "$ROCMVERSION" != "6.5" ]; then \
-        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.4.60401-1_all.deb  --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.4.60401-1_all.deb && \
-        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
-        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \
-        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \
-    fi
-
-RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu jammy main universe | tee -a /etc/apt/sources.list" && \
-    amdgpu-install -y --usecase=rocm --no-dkms
+RUN wget https://repo.radeon.com/amdgpu-install/7.0.1/ubuntu/noble/amdgpu-install_7.0.1.70001-1_all.deb && \
+    apt install ./amdgpu-install_7.0.1.70001-1_all.deb -y && \
+    apt update && \
+    apt install python3-setuptools python3-wheel -y && \
+    apt install rocm-dev -y

 ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
 ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
@@ -45,7 +41,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    libelf-dev \
    libnuma-dev \
    libpthread-stubs0-dev \
-    llvm-amdgpu \
    mpich \
    net-tools \
    pkg-config \
@@ -61,17 +56,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    zip \
    libzstd-dev \
    openssh-server \
-    clang-format-12 \
    clang-format-18 \
    kmod && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf amdgpu-install* && \
-# Remove unnecessary rocm components that take a lot of space
-    apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
-
 #Install latest ccache
-RUN git clone https://github.com/ccache/ccache.git && \
+    git clone https://github.com/ccache/ccache.git && \
    cd ccache && mkdir build && cd build && cmake .. && make install && \
 #Install ninja build tracing tools
    cd / && \
--- a/Dockerfile.compiler
+++ b/Dockerfile.compiler
@@ -1,4 +1,4 @@
-ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm6.4.1"
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm7.0.1"
 FROM $BASE_DOCKER
 ARG compiler_version=""
 ARG compiler_commit=""
--- a/21
+++ b/21
@@ -53,7 +53,7 @@ def getBaseDockerImageName(){
    }
    else{
        def ROCM_numeric = parseVersion("${params.ROCMVERSION}")
-        if ( ROCM_numeric.major <= 6 && ROCM_numeric.minor < 5 ){
+        if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 1 ){
            img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
            }
        else{
@@ -930,7 +930,8 @@ def run_pytorch_tests(Map conf=[:]){
 }

 //launch develop branch daily jobs
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true
+                                              0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
                                              0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
                                              0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
                                              0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
@@ -957,8 +958,8 @@ pipeline {
            description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
        string(
            name: 'ROCMVERSION',
-            defaultValue: '6.4.1',
-            description: 'Specify which ROCM version to use: 6.4.1 (default).')
+            defaultValue: '7.0.1',
+            description: 'Specify which ROCM version to use: 7.0.1 (default).')
        string(
            name: 'COMPILER_VERSION',
            defaultValue: '',
@@ -1037,8 +1038,8 @@ pipeline {
            description: "Build CK and run tests on gfx942 (default: ON)")
        booleanParam(
            name: "BUILD_GFX950",
-            defaultValue: false,
-            description: "Build CK and run tests on gfx950 (default: OFF)")
+            defaultValue: true,
+            description: "Build CK and run tests on gfx950 (default: ON)")
        booleanParam(
            name: "BUILD_GFX10",
            defaultValue: true,
@@ -1290,7 +1291,7 @@ pipeline {
                    agent{ label rocmnode("gfx90a")}
                    environment{
                        setup_args = "NO_CK_BUILD"
-                        execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
+                        execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake -DCMAKE_PREFIX_PATH=/opt/rocm ../codegen && \
                                           make -j64 check"""
                    }
                    steps{
@@ -1350,7 +1351,7 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx950") }
                    environment{
-                        def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0"
+                        def docker_name = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1"
                        setup_args = "NO_CK_BUILD"
                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx950 && \
                                           make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \
@@ -1566,7 +1567,7 @@ pipeline {
                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                        cleanWs()
                    }
                }
@@ -1631,7 +1632,7 @@ pipeline {
                                    -D CMAKE_BUILD_TYPE=Release \
                                    -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """

-                            buildHipClangJobAndReboot(setup_cmd: "",  build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0")
+                            buildHipClangJobAndReboot(setup_cmd: "",  build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1")
                        }
                        cleanWs()
                    }
--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -12,6 +12,7 @@ configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
 find_package(ROCM)
 include(ROCMInstallTargets)
 include(ROCMTest)
+find_package(hiprtc REQUIRED)

 rocm_setup_version(VERSION 1.0)

@@ -27,7 +28,7 @@ add_compile_options(-std=c++20)
 file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
 # TODO: Use object library
 add_library(ck_host STATIC ${SOURCES})
-target_link_libraries(ck_host PRIVATE ck_headers)
+target_link_libraries(ck_host PRIVATE ck_headers hiprtc::hiprtc)

 set_target_properties(ck_host PROPERTIES 
    LINKER_LANGUAGE CXX
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -4,6 +4,7 @@
 #pragma once

 #include "ck/config.h"
+#include <stdint.h>

 #if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -20,6 +20,7 @@ static constexpr bool is_scale_mfma_data_type()
           is_same_v<U, bf6_t> || is_same_v<U, f4_t>;
 }

+#ifndef CK_CODE_GEN_RTC
 /**
 * @brief Define scale data types that have hardware support for MX GEMMs
 */
@@ -28,6 +29,7 @@ static constexpr bool is_scale_mfma_scale_type()
 {
    return is_same_v<T, e8m0_bexp_t>;
 }
+#endif

 /**
 * @brief Combination of data types that have hardware support for MX GEMMs
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-
+#include <stdint.h>
 #include "ck/utility/amd_ck_fp8.hpp"
 #include "ck/utility/e8m0.hpp"
 #include "ck/utility/statically_indexed_array.hpp"
@@ -325,12 +325,14 @@ struct scalar_type<bf8_ocp_t>
    static constexpr index_t vector_size = 1;
 };

+#ifndef CK_CODE_GEN_RTC
 template <>
 struct scalar_type<e8m0_bexp_t>
 {
    using type                           = e8m0_bexp_t::type;
    static constexpr index_t vector_size = 1;
 };
+#endif

 template <>
 struct scalar_type<f4x2_pk_t>
@@ -483,8 +485,10 @@ inline const char* get_type_name()
        return "f8";
    else if constexpr(is_same_v<T, bf8_t>)
        return "bf8";
+#ifndef CK_CODE_GEN_RTC
    else if constexpr(is_same_v<T, e8m0_bexp_t>)
        return "e8m0";
+#endif
    else if constexpr(is_same_v<T, float>)
        return "fp32";
 #if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
--- a/include/ck/utility/debug.hpp
+++ b/include/ck/utility/debug.hpp
@@ -13,7 +13,7 @@ template <typename T, typename Enable = void>
 struct PrintAsType;

 template <typename T>
-struct PrintAsType<T, typename std::enable_if<std::is_floating_point<T>::value>::type>
+struct PrintAsType<T, typename enable_if<is_floating_point<T>::value>::type>
 {
    using type = float;
    __host__ __device__ static void Print(const T& p) { printf("%.3f ", static_cast<type>(p)); }
@@ -30,7 +30,7 @@ struct PrintAsType<ck::half_t, void>
 };

 template <typename T>
-struct PrintAsType<T, typename std::enable_if<std::is_integral<T>::value>::type>
+struct PrintAsType<T, typename enable_if<is_integral<T>::value>::type>
 {
    using type = int;
    __host__ __device__ static void Print(const T& p) { printf("%d ", static_cast<type>(p)); }
--- a/include/ck/utility/dtype_vector.hpp
+++ b/include/ck/utility/dtype_vector.hpp
@@ -1294,6 +1294,7 @@ struct nnvb_data_t_selector<bf8_ocp_t>
    using type = bf8_ocp_t::data_type;
 };

+#ifndef CK_CODE_GEN_RTC
 template <>
 struct nnvb_data_t_selector<f8_fnuz_t>
 {
@@ -1311,6 +1312,7 @@ struct nnvb_data_t_selector<e8m0_bexp_t>
 {
    using type = e8m0_bexp_t::type;
 };
+#endif

 template <>
 struct nnvb_data_t_selector<f6x16_pk_t>
@@ -2270,8 +2272,10 @@ using bf6x16_t   = typename vector_type<bf6x16_pk_t, 1>::type;
 using bf6x16x2_t = typename vector_type<bf6x16_pk_t, 2>::type;
 using bf6x32_t   = typename vector_type<bf6x32_pk_t, 1>::type;

+#ifndef CK_CODE_GEN_RTC
 // e8m0
 using e8m0x4_bexp_t = typename vector_type<e8m0_bexp_t, 4>::type;
+#endif

 // pack int4
 using pk_i4x2_t = typename vector_type<pk_i4_t, 2>::type;
--- a/include/ck/utility/e8m0.hpp
+++ b/include/ck/utility/e8m0.hpp
@@ -3,6 +3,7 @@

 #pragma once

+#ifndef CK_CODE_GEN_RTC
 #include "ck/utility/type.hpp"

 namespace ck {
@@ -78,3 +79,4 @@ __host__ __device__ inline constexpr int32_t get_exponent_value<e8m0_bexp_t>(e8m
 } // namespace utils

 } // namespace ck
+#endif
--- a/include/ck/utility/f8_utils.hpp
+++ b/include/ck/utility/f8_utils.hpp
@@ -273,8 +273,8 @@ template <typename X, typename Y, bool negative_zero_nan, bool clip, bool stoch>
 __host__ __device__ Y cast_to_f8(X x, uint32_t rng)
 {
    // check datatypes
-    constexpr bool is_half  = std::is_same<X, half_t>::value;
-    constexpr bool is_float = std::is_same<X, float>::value;
+    constexpr bool is_half  = is_same<X, half_t>::value;
+    constexpr bool is_float = is_same<X, float>::value;
    static_assert(is_half || is_float, "Only half and float can be casted.");

    return run_cast_to_f8<X, Y, negative_zero_nan, clip, stoch>(x, rng);
@@ -284,8 +284,8 @@ template <typename X, typename Y, bool negative_zero_nan>
 __host__ __device__ Y cast_from_f8(X x)
 {
    // check datatype
-    constexpr bool is_half  = std::is_same<Y, half_t>::value;
-    constexpr bool is_float = std::is_same<Y, float>::value;
+    constexpr bool is_half  = is_same<Y, half_t>::value;
+    constexpr bool is_float = is_same<Y, float>::value;
    static_assert(is_half || is_float, "only half and float are supported.");

    return run_cast_from_f8<X, Y, negative_zero_nan>(x);
--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
@@ -10,10 +10,6 @@
 #include "type.hpp"
 #include "tuple.hpp"

-#ifdef CK_CODE_GEN_RTC
-#define INT32_MAX 2147483647
-#endif
-
 namespace ck {

 // magic number division
--- a/include/ck/utility/numeric_limits.hpp
+++ b/include/ck/utility/numeric_limits.hpp
@@ -522,8 +522,6 @@ struct NumericLimits<bf6_t>
    }
 };

-#endif
-
 template <>
 struct NumericLimits<e8m0_bexp_t>
 {
@@ -551,5 +549,6 @@ struct NumericLimits<e8m0_bexp_t>
        return e8m0_bexp_t(binary_142);
    }
 };
+#endif

 } // namespace ck
--- a/include/ck/utility/numeric_utils.hpp
+++ b/include/ck/utility/numeric_utils.hpp
@@ -10,6 +10,7 @@ struct NumericUtils
 {
 };

+#ifndef CK_CODE_GEN_RTC
 template <>
 struct NumericUtils<e8m0_bexp_t>
 {
@@ -24,6 +25,7 @@ struct NumericUtils<e8m0_bexp_t>

    using bitwise_type = uint8_t;
 };
+#endif

 template <>
 struct NumericUtils<float>
--- a/include/ck/utility/random_gen.hpp
+++ b/include/ck/utility/random_gen.hpp
@@ -15,7 +15,7 @@ namespace ck {

 // Pseudo random number generator
 // version for fp32
-template <typename T, uint32_t seed_t, ck::enable_if_t<std::is_same<float, T>{}, bool> = false>
+template <typename T, uint32_t seed_t, ck::enable_if_t<is_same<float, T>{}, bool> = false>
 __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
 {
    uint32_t x         = bit_cast<uint32_t>(val);
@@ -31,7 +31,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 }

 // version for fp16
-template <typename T, uint32_t seed_t, ck::enable_if_t<std::is_same<_Float16, T>{}, bool> = false>
+template <typename T, uint32_t seed_t, ck::enable_if_t<is_same<_Float16, T>{}, bool> = false>
 __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
 {
    uint16_t x         = bit_cast<uint16_t>(val);
@@ -48,7 +48,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 // return 0 if data is not fp16 or fp32
 template <typename T,
          uint32_t seed_t,
-          ck::enable_if_t<!(std::is_same<float, T>{} || std::is_same<_Float16, T>{}), bool> = false>
+          ck::enable_if_t<!(is_same<float, T>{} || is_same<_Float16, T>{}), bool> = false>
 __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
 {
    ck::ignore = id;