diff --git a/Dockerfile b/Dockerfile index 6f5cd0115d..07327442fe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,27 +1,23 @@ + FROM ubuntu:24.04 ARG DEBIAN_FRONTEND=noninteractive -ARG ROCMVERSION=6.4.1 +ARG ROCMVERSION=7.0.1 ARG compiler_version="" ARG compiler_commit="" ARG CK_SCCACHE="" ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn +ENV DEBIAN_FRONTEND=noninteractive # Add rocm repository RUN set -xe && \ - apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \ - curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg + apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl -RUN if [ "$ROCMVERSION" != "6.5" ]; then \ - sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.4.60401-1_all.deb --no-check-certificate" && \ - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.4.60401-1_all.deb && \ - wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ - sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \ - sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \ - fi - -RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu jammy main universe | tee -a /etc/apt/sources.list" && \ - amdgpu-install -y --usecase=rocm --no-dkms +RUN wget https://repo.radeon.com/amdgpu-install/7.0.1/ubuntu/noble/amdgpu-install_7.0.1.70001-1_all.deb && \ + apt install ./amdgpu-install_7.0.1.70001-1_all.deb -y && \ + apt update && \ + apt install python3-setuptools python3-wheel -y && \ + apt install rocm-dev -y ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache @@ -45,7 +41,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- libelf-dev \ libnuma-dev \ libpthread-stubs0-dev \ - llvm-amdgpu \ mpich \ net-tools \ pkg-config \ @@ -61,17 +56,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- zip \ libzstd-dev \ openssh-server \ - clang-format-12 \ clang-format-18 \ kmod && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* && \ rm -rf amdgpu-install* && \ -# Remove unnecessary rocm components that take a lot of space - apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt - #Install latest ccache -RUN git clone https://github.com/ccache/ccache.git && \ + git clone https://github.com/ccache/ccache.git && \ cd ccache && mkdir build && cd build && cmake .. && make install && \ #Install ninja build tracing tools cd / && \ diff --git a/Dockerfile.compiler b/Dockerfile.compiler index 0306057e45..47bd8294b6 100644 --- a/Dockerfile.compiler +++ b/Dockerfile.compiler @@ -1,4 +1,4 @@ -ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm6.4.1" +ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm7.0.1" FROM $BASE_DOCKER ARG compiler_version="" ARG compiler_commit="" diff --git a/Jenkinsfile b/Jenkinsfile index efe08a7d41..6eaf73201e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -53,7 +53,7 @@ def getBaseDockerImageName(){ } else{ def ROCM_numeric = parseVersion("${params.ROCMVERSION}") - if ( ROCM_numeric.major <= 6 && ROCM_numeric.minor < 5 ){ + if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 1 ){ img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}" } else{ @@ -930,7 +930,8 @@ def run_pytorch_tests(Map conf=[:]){ } //launch develop branch daily jobs -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true + 0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true @@ -957,8 +958,8 @@ pipeline { description: 'If you want to use a custom docker image, please specify it here (default: leave blank).') string( name: 'ROCMVERSION', - defaultValue: '6.4.1', - description: 'Specify which ROCM version to use: 6.4.1 (default).') + defaultValue: '7.0.1', + description: 'Specify which ROCM version to use: 7.0.1 (default).') string( name: 'COMPILER_VERSION', defaultValue: '', @@ -1037,8 +1038,8 @@ pipeline { description: "Build CK and run tests on gfx942 (default: ON)") booleanParam( name: "BUILD_GFX950", - defaultValue: false, - description: "Build CK and run tests on gfx950 (default: OFF)") + defaultValue: true, + description: "Build CK and run tests on gfx950 (default: ON)") booleanParam( name: "BUILD_GFX10", defaultValue: true, @@ -1290,7 +1291,7 @@ pipeline { agent{ label rocmnode("gfx90a")} environment{ setup_args = "NO_CK_BUILD" - execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \ + execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake -DCMAKE_PREFIX_PATH=/opt/rocm ../codegen && \ make -j64 check""" } steps{ @@ -1350,7 +1351,7 @@ pipeline { } agent{ label rocmnode("gfx950") } environment{ - def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0" + def docker_name = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1" setup_args = "NO_CK_BUILD" execute_args = """ ../script/cmake-ck-dev.sh ../ gfx950 && \ make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \ @@ -1566,7 +1567,7 @@ pipeline { -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ - Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } } @@ -1631,7 +1632,7 @@ pipeline { -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """ - buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0") + buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1") } cleanWs() } diff --git a/codegen/CMakeLists.txt b/codegen/CMakeLists.txt index 2b2e6e2949..80429a781b 100644 --- a/codegen/CMakeLists.txt +++ b/codegen/CMakeLists.txt @@ -12,6 +12,7 @@ configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h) find_package(ROCM) include(ROCMInstallTargets) include(ROCMTest) +find_package(hiprtc REQUIRED) rocm_setup_version(VERSION 1.0) @@ -27,7 +28,7 @@ add_compile_options(-std=c++20) file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp) # TODO: Use object library add_library(ck_host STATIC ${SOURCES}) -target_link_libraries(ck_host PRIVATE ck_headers) +target_link_libraries(ck_host PRIVATE ck_headers hiprtc::hiprtc) set_target_properties(ck_host PROPERTIES LINKER_LANGUAGE CXX diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 5783605f8d..7aee7fca28 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -4,6 +4,7 @@ #pragma once #include "ck/config.h" +#include #if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index a97d9589cf..a86aa2f8ef 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -20,6 +20,7 @@ static constexpr bool is_scale_mfma_data_type() is_same_v || is_same_v; } +#ifndef CK_CODE_GEN_RTC /** * @brief Define scale data types that have hardware support for MX GEMMs */ @@ -28,6 +29,7 @@ static constexpr bool is_scale_mfma_scale_type() { return is_same_v; } +#endif /** * @brief Combination of data types that have hardware support for MX GEMMs diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 984bb4d862..574269b94a 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -2,7 +2,7 @@ // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once - +#include #include "ck/utility/amd_ck_fp8.hpp" #include "ck/utility/e8m0.hpp" #include "ck/utility/statically_indexed_array.hpp" @@ -325,12 +325,14 @@ struct scalar_type static constexpr index_t vector_size = 1; }; +#ifndef CK_CODE_GEN_RTC template <> struct scalar_type { using type = e8m0_bexp_t::type; static constexpr index_t vector_size = 1; }; +#endif template <> struct scalar_type @@ -483,8 +485,10 @@ inline const char* get_type_name() return "f8"; else if constexpr(is_same_v) return "bf8"; +#ifndef CK_CODE_GEN_RTC else if constexpr(is_same_v) return "e8m0"; +#endif else if constexpr(is_same_v) return "fp32"; #if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC) diff --git a/include/ck/utility/debug.hpp b/include/ck/utility/debug.hpp index 45d443ae49..1b86b33777 100644 --- a/include/ck/utility/debug.hpp +++ b/include/ck/utility/debug.hpp @@ -13,7 +13,7 @@ template struct PrintAsType; template -struct PrintAsType::value>::type> +struct PrintAsType::value>::type> { using type = float; __host__ __device__ static void Print(const T& p) { printf("%.3f ", static_cast(p)); } @@ -30,7 +30,7 @@ struct PrintAsType }; template -struct PrintAsType::value>::type> +struct PrintAsType::value>::type> { using type = int; __host__ __device__ static void Print(const T& p) { printf("%d ", static_cast(p)); } diff --git a/include/ck/utility/dtype_vector.hpp b/include/ck/utility/dtype_vector.hpp index 27a7545a0e..084240f84b 100644 --- a/include/ck/utility/dtype_vector.hpp +++ b/include/ck/utility/dtype_vector.hpp @@ -1294,6 +1294,7 @@ struct nnvb_data_t_selector using type = bf8_ocp_t::data_type; }; +#ifndef CK_CODE_GEN_RTC template <> struct nnvb_data_t_selector { @@ -1311,6 +1312,7 @@ struct nnvb_data_t_selector { using type = e8m0_bexp_t::type; }; +#endif template <> struct nnvb_data_t_selector @@ -2270,8 +2272,10 @@ using bf6x16_t = typename vector_type::type; using bf6x16x2_t = typename vector_type::type; using bf6x32_t = typename vector_type::type; +#ifndef CK_CODE_GEN_RTC // e8m0 using e8m0x4_bexp_t = typename vector_type::type; +#endif // pack int4 using pk_i4x2_t = typename vector_type::type; diff --git a/include/ck/utility/e8m0.hpp b/include/ck/utility/e8m0.hpp index f7d2a2f594..ac2a114593 100644 --- a/include/ck/utility/e8m0.hpp +++ b/include/ck/utility/e8m0.hpp @@ -3,6 +3,7 @@ #pragma once +#ifndef CK_CODE_GEN_RTC #include "ck/utility/type.hpp" namespace ck { @@ -78,3 +79,4 @@ __host__ __device__ inline constexpr int32_t get_exponent_value(e8m } // namespace utils } // namespace ck +#endif diff --git a/include/ck/utility/f8_utils.hpp b/include/ck/utility/f8_utils.hpp index 748aa07f9e..94c2f84c8c 100644 --- a/include/ck/utility/f8_utils.hpp +++ b/include/ck/utility/f8_utils.hpp @@ -273,8 +273,8 @@ template __host__ __device__ Y cast_to_f8(X x, uint32_t rng) { // check datatypes - constexpr bool is_half = std::is_same::value; - constexpr bool is_float = std::is_same::value; + constexpr bool is_half = is_same::value; + constexpr bool is_float = is_same::value; static_assert(is_half || is_float, "Only half and float can be casted."); return run_cast_to_f8(x, rng); @@ -284,8 +284,8 @@ template __host__ __device__ Y cast_from_f8(X x) { // check datatype - constexpr bool is_half = std::is_same::value; - constexpr bool is_float = std::is_same::value; + constexpr bool is_half = is_same::value; + constexpr bool is_float = is_same::value; static_assert(is_half || is_float, "only half and float are supported."); return run_cast_from_f8(x); diff --git a/include/ck/utility/magic_division.hpp b/include/ck/utility/magic_division.hpp index 993b70a3fb..7227cee754 100644 --- a/include/ck/utility/magic_division.hpp +++ b/include/ck/utility/magic_division.hpp @@ -10,10 +10,6 @@ #include "type.hpp" #include "tuple.hpp" -#ifdef CK_CODE_GEN_RTC -#define INT32_MAX 2147483647 -#endif - namespace ck { // magic number division diff --git a/include/ck/utility/numeric_limits.hpp b/include/ck/utility/numeric_limits.hpp index e59b7eceaf..b8d6280acc 100644 --- a/include/ck/utility/numeric_limits.hpp +++ b/include/ck/utility/numeric_limits.hpp @@ -522,8 +522,6 @@ struct NumericLimits } }; -#endif - template <> struct NumericLimits { @@ -551,5 +549,6 @@ struct NumericLimits return e8m0_bexp_t(binary_142); } }; +#endif } // namespace ck diff --git a/include/ck/utility/numeric_utils.hpp b/include/ck/utility/numeric_utils.hpp index 726f667518..399bc0c3e8 100644 --- a/include/ck/utility/numeric_utils.hpp +++ b/include/ck/utility/numeric_utils.hpp @@ -10,6 +10,7 @@ struct NumericUtils { }; +#ifndef CK_CODE_GEN_RTC template <> struct NumericUtils { @@ -24,6 +25,7 @@ struct NumericUtils using bitwise_type = uint8_t; }; +#endif template <> struct NumericUtils diff --git a/include/ck/utility/random_gen.hpp b/include/ck/utility/random_gen.hpp index 2ff46457fc..dd2662b6d9 100644 --- a/include/ck/utility/random_gen.hpp +++ b/include/ck/utility/random_gen.hpp @@ -15,7 +15,7 @@ namespace ck { // Pseudo random number generator // version for fp32 -template {}, bool> = false> +template {}, bool> = false> __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t) { uint32_t x = bit_cast(val); @@ -31,7 +31,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = } // version for fp16 -template {}, bool> = false> +template {}, bool> = false> __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t) { uint16_t x = bit_cast(val); @@ -48,7 +48,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = // return 0 if data is not fp16 or fp32 template {} || std::is_same<_Float16, T>{}), bool> = false> + ck::enable_if_t{} || is_same<_Float16, T>{}), bool> = false> __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t) { ck::ignore = id;