From f5d1e3fa4878fcfa380082e357e89152756327ce Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 16 Jul 2025 07:37:53 -0700 Subject: [PATCH 1/4] Use a clang20 compiler for gfx950 builds. (#2504) * update docker tag for gfx950 ci build * update compiler path for gfx950 ci build * suppress compiler path override for gfx950 * clean up --- Jenkinsfile | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 50c15701a7..a7dc8360ee 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -234,11 +234,6 @@ def cmake_build(Map conf=[:]){ def build_type_debug = (conf.get("build_type",'release') == 'debug') - // use special compiler for gfx950 - if ( check_arch() == 7){ - compiler = "/llvm-project/build/bin/clang++" - } - //cmake_env can overwrite default CXX variables. def cmake_envs = "CXX=${compiler} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","") @@ -1352,12 +1347,12 @@ pipeline { execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ -DGPU_TARGETS="gfx950" \ - -DCMAKE_CXX_COMPILER=/llvm-project/build/bin/clang++ \ + -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ - Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub22.04_rocm7.0", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } } From a4bf78ac0ec5882692423bd5b58d84feb3488629 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 16 Jul 2025 07:39:15 -0700 Subject: [PATCH 2/4] replace obsolete warpSize system variable with the new one (#2496) --- .../gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp index 156db6e636..be85528f28 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp @@ -467,7 +467,7 @@ struct GridwiseMoeGemmMX_BPreshuffle __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0) { - constexpr index_t NkSwizzleNumber = Number{}; + constexpr index_t NkSwizzleNumber = Number{}; return make_naive_tensor_descriptor_packed( make_tuple(N0 / NWave / NXdlPack, NWave, NXdlPack, K0, NkSwizzleNumber)); } @@ -1474,7 +1474,7 @@ struct GridwiseMoeGemmMX_BPreshuffle make_multi_index(n_block_data_idx_on_grid, get_warp_local_1d_id() % NWave, 0, - KPack / KGroup * (get_thread_local_1d_id() % warpSize))); + KPack / KGroup * (get_thread_local_1d_id() % WarpSize))); // LDS allocation for A and B: be careful of alignment // Cast after lds @@ -1567,7 +1567,7 @@ struct GridwiseMoeGemmMX_BPreshuffle make_multi_index(n_block_data_idx_on_grid, get_warp_local_1d_id() % NWave, 0, - KPack / KGroup * (get_thread_local_1d_id() % warpSize))); + KPack / KGroup * (get_thread_local_1d_id() % WarpSize))); const BScaleDataType* p_b_scale_grid_up = p_b_scale_grid + expert_scale_stride / 2; const auto b_scale_grid_buf_up = make_dynamic_buffer( p_b_scale_grid_up + expert_id * expert_scale_stride, @@ -2185,7 +2185,7 @@ struct GridwiseMoeGemmMX_BPreshuffle get_warp_local_1d_id() % NWave, 0, 0, - KPack * (get_thread_local_1d_id() % warpSize))); + KPack * (get_thread_local_1d_id() % WarpSize))); // LDS allocation for A and B: be careful of alignment // Cast after lds @@ -2289,7 +2289,7 @@ struct GridwiseMoeGemmMX_BPreshuffle get_warp_local_1d_id() % NWave, 0, 0, - KPack * (get_thread_local_1d_id() % warpSize))); + KPack * (get_thread_local_1d_id() % WarpSize))); const BScaleDataType* p_b_scale_grid_up = p_b_scale_grid + expert_scale_stride / 2 / sizeof(BScaleDataType); const auto b_scale_grid_buf_up = make_dynamic_buffer( From 6e76b82059eceb1a1614f4a335c70faa2d122c97 Mon Sep 17 00:00:00 2001 From: linqunAMD Date: Wed, 16 Jul 2025 22:58:23 +0800 Subject: [PATCH 3/4] Fix build errors on windows (#2456) * Fix build errors on windows * correct clang format --------- Co-authored-by: Lin, Qun --- cmake/gtest.cmake | 3 ++ .../34_batchnorm/batchnorm_backward_nhwc.cpp | 4 +- .../batchnorm_forward_inferring_nhwc.cpp | 5 +-- .../batchnorm_forward_training_nhwc.cpp | 7 ++-- ...tchnorm_forward_training_nhwc_obsolete.cpp | 7 ++-- example/CMakeLists.txt | 1 + include/ck/utility/amd_xdlops.hpp | 32 +++++++------- include/ck/utility/env.hpp | 1 + include/ck/utility/synchronization.hpp | 2 +- .../ops/gemm/kernel/batched_gemm_kernel.hpp | 2 +- .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 2 +- .../ops/gemm/kernel/grouped_gemm_kernel.hpp | 4 +- .../warp/warp_gemm_attribute_mfma_impl.hpp | 42 +++++++++---------- .../include/profiler/profile_gemm_impl.hpp | 4 ++ profiler/src/profile_batched_gemm_b_scale.cpp | 3 +- profiler/src/profile_gemm_b_scale.cpp | 3 +- test/scatter_gather/scatter_gather.cpp | 4 +- 17 files changed, 67 insertions(+), 59 deletions(-) diff --git a/cmake/gtest.cmake b/cmake/gtest.cmake index 0915f53411..6587f4c4be 100644 --- a/cmake/gtest.cmake +++ b/cmake/gtest.cmake @@ -68,3 +68,6 @@ endif() target_compile_options(gtest PRIVATE ${GTEST_CXX_FLAGS}) target_compile_options(gtest_main PRIVATE ${GTEST_CXX_FLAGS}) +target_compile_definitions(gtest PRIVATE GTEST_HAS_SEH=0) +target_compile_definitions(gtest_main PRIVATE GTEST_HAS_SEH=0) + diff --git a/example/34_batchnorm/batchnorm_backward_nhwc.cpp b/example/34_batchnorm/batchnorm_backward_nhwc.cpp index 3756310fd7..9737b0d99b 100644 --- a/example/34_batchnorm/batchnorm_backward_nhwc.cpp +++ b/example/34_batchnorm/batchnorm_backward_nhwc.cpp @@ -403,10 +403,10 @@ bool bnorm_bwd_nhwc_test(bool do_verification, return (pass); }; -static const double epsilon = std::numeric_limits::epsilon(); - int main(int argc, char* argv[]) { + static const double epsilon = std::numeric_limits::epsilon(); + bool pass = true; if(argc > 1) diff --git a/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp index 6a8002025a..1ffbabd04b 100644 --- a/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp +++ b/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp @@ -314,11 +314,10 @@ bool bnorm_infer_nhwc_test(bool do_verification, return (pass); }; -static const double epsilon = std::numeric_limits::epsilon(); - int main(int argc, char* argv[]) { - bool pass = true; + static const double epsilon = std::numeric_limits::epsilon(); + bool pass = true; if(argc > 1) { diff --git a/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp index b27358fd9d..06441be860 100644 --- a/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp +++ b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp @@ -453,12 +453,11 @@ bool bnorm_fwd_nhwc_test(bool do_verification, return (pass); }; -const double epsilon = std::numeric_limits::epsilon(); -static const double averageFactor = 0.1; - int main(int argc, char* argv[]) { - bool pass = true; + const double epsilon = std::numeric_limits::epsilon(); + static const double averageFactor = 0.1; + bool pass = true; if(argc > 1) { diff --git a/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp b/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp index ffb9f4b584..8f2b7613b5 100644 --- a/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp +++ b/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp @@ -453,12 +453,11 @@ bool bnorm_fwd_nhwc_test(bool do_verification, return (pass); }; -const double epsilon = std::numeric_limits::epsilon(); -static const double averageFactor = 0.1; - int main(int argc, char* argv[]) { - bool pass = true; + const double epsilon = std::numeric_limits::epsilon(); + static const double averageFactor = 0.1; + bool pass = true; if(argc > 1) { diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 56d709f41b..3c67e9214f 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -128,6 +128,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME) set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP) add_executable(${EXAMPLE_NAME} ${FILE_NAME}) target_link_libraries(${EXAMPLE_NAME} PRIVATE utility) + target_link_libraries(${EXAMPLE_NAME} PRIVATE getopt::getopt) add_test(NAME ${EXAMPLE_NAME} COMMAND $ ${ARGN}) set_property(TARGET ${EXAMPLE_NAME} PROPERTY HIP_ARCHITECTURES ${EX_TARGETS} ) add_dependencies(examples ${EXAMPLE_NAME}) diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 8646b8393b..02a7a72b8c 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -1396,8 +1396,8 @@ struct intrin_mfma_f32_32x32x16f8f8<32, 32> #if defined(__gfx94__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( - bit_cast(reg_a), - bit_cast(reg_b), + bit_cast(reg_a), + bit_cast(reg_b), reg_c.template AsType()[Number<0>{}], 0, 0, @@ -1427,8 +1427,8 @@ struct intrin_mfma_f32_16x16x32f8f8<16, 16> { #if defined(__gfx94__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8( - bit_cast(reg_a), - bit_cast(reg_b), + bit_cast(reg_a), + bit_cast(reg_b), reg_c.template AsType()[Number<0>{}], 0, 0, @@ -1459,8 +1459,8 @@ struct intrin_mfma_f32_32x32x16bf8bf8<32, 32> #if defined(__gfx94__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8( - bit_cast(reg_a), - bit_cast(reg_b), + bit_cast(reg_a), + bit_cast(reg_b), reg_c.template AsType()[Number<0>{}], 0, 0, @@ -1490,8 +1490,8 @@ struct intrin_mfma_f32_16x16x32bf8bf8<16, 16> { #if defined(__gfx94__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8( - bit_cast(reg_a), - bit_cast(reg_b), + bit_cast(reg_a), + bit_cast(reg_b), reg_c.template AsType()[Number<0>{}], 0, 0, @@ -1522,8 +1522,8 @@ struct intrin_mfma_f32_32x32x16f8bf8<32, 32> #if defined(__gfx94__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8( - bit_cast(reg_a), - bit_cast(reg_b), + bit_cast(reg_a), + bit_cast(reg_b), reg_c.template AsType()[Number<0>{}], 0, 0, @@ -1553,8 +1553,8 @@ struct intrin_mfma_f32_16x16x32f8bf8<16, 16> { #if defined(__gfx94__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8( - bit_cast(reg_a), - bit_cast(reg_b), + bit_cast(reg_a), + bit_cast(reg_b), reg_c.template AsType()[Number<0>{}], 0, 0, @@ -1585,8 +1585,8 @@ struct intrin_mfma_f32_32x32x16bf8f8<32, 32> #if defined(__gfx94__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8( - bit_cast(reg_a), - bit_cast(reg_b), + bit_cast(reg_a), + bit_cast(reg_b), reg_c.template AsType()[Number<0>{}], 0, 0, @@ -1616,8 +1616,8 @@ struct intrin_mfma_f32_16x16x32bf8f8<16, 16> { #if defined(__gfx94__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8( - bit_cast(reg_a), - bit_cast(reg_b), + bit_cast(reg_a), + bit_cast(reg_b), reg_c.template AsType()[Number<0>{}], 0, 0, diff --git a/include/ck/utility/env.hpp b/include/ck/utility/env.hpp index 46ba32bb87..2f5b804d16 100644 --- a/include/ck/utility/env.hpp +++ b/include/ck/utility/env.hpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace ck { namespace internal { diff --git a/include/ck/utility/synchronization.hpp b/include/ck/utility/synchronization.hpp index d6b6eac26c..7652e73809 100644 --- a/include/ck/utility/synchronization.hpp +++ b/include/ck/utility/synchronization.hpp @@ -33,7 +33,7 @@ __device__ void block_sync_lds_direct_load() { #ifdef __gfx12__ asm volatile("\ - s_wait_vmcnt 0x0 \n \ + s_wait_loadcnt 0x0 \n \ s_wait_dscnt 0x0 \n \ s_barrier_signal -1 \n \ s_barrier_wait -1 \ diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp index 09c7d58558..fc72138abf 100644 --- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp @@ -74,7 +74,7 @@ struct BatchedGemmKernel : public GemmKernel, + return concat('_', "gemm_batched", gemm_prec_str(), concat('x', P_::MPerBlock, P_::NPerBlock, P_::KPerBlock), concat('x', P_::GetVectorSizeA(), P_::GetVectorSizeB(), P_::GetVectorSizeC()), concat('x', P_::kPadM, P_::kPadN, P_::kPadK)); diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index 516d4298ef..53c21b49f5 100755 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -196,7 +196,7 @@ struct GemmKernel [[nodiscard]] CK_TILE_HOST static const std::string GetName() { // clang-format off - return concat('_', "gemm", gemm_prec_str, GemmPipeline::GetName()); + return concat('_', "gemm", gemm_prec_str(), GemmPipeline::GetName()); // clang-format on } diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp index 533cabb736..2605b1afbc 100644 --- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp @@ -57,7 +57,7 @@ struct GroupedGemmKernel : public GemmKernel, + return concat('_', "gemm_grouped", gemm_prec_str(), concat('x', P_::MPerBlock, P_::NPerBlock, P_::KPerBlock), concat('x', P_::GetVectorSizeA(), P_::GetVectorSizeB(), P_::GetVectorSizeC()), concat('x', P_::kPadM, P_::kPadN, P_::kPadK), @@ -95,7 +95,7 @@ struct GroupedGemmKernel : public GemmKernel>& gemm_descs) { index_t grid_size = 0; diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp index 80f38f263b..0831cf85c4 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp @@ -1095,16 +1095,16 @@ struct WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base #if defined(__gfx94__) or defined(__gfx95__) if constexpr(std::is_same_v && std::is_same_v) c_vec = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); else if constexpr(std::is_same_v && std::is_same_v) c_vec = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); else if constexpr(std::is_same_v && std::is_same_v) c_vec = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); else if constexpr(std::is_same_v && std::is_same_v) c_vec = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); #else ck_tile::ignore = c_vec; ck_tile::ignore = a_vec; @@ -1119,16 +1119,16 @@ struct WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base #if defined(__gfx94__) or defined(__gfx95__) if constexpr(std::is_same_v && std::is_same_v) return bit_cast(__builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8( - bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); else if constexpr(std::is_same_v && std::is_same_v) return bit_cast(__builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8( - bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); else if constexpr(std::is_same_v && std::is_same_v) return bit_cast(__builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8( - bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); else if constexpr(std::is_same_v && std::is_same_v) return bit_cast(__builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8( - bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); #else ck_tile::ignore = a_vec; ck_tile::ignore = b_vec; @@ -1254,16 +1254,16 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base #if defined(__gfx94__) or defined(__gfx95__) if constexpr(std::is_same_v && std::is_same_v) c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); else if constexpr(std::is_same_v && std::is_same_v) c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); else if constexpr(std::is_same_v && std::is_same_v) c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); else if constexpr(std::is_same_v && std::is_same_v) c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); #elif defined(__gfx908__) || defined(__gfx90a__) static_for<0, 8, 1>{}([&](auto k) { float a_f32 = @@ -1289,16 +1289,16 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base #if defined(__gfx94__) or defined(__gfx95__) if constexpr(std::is_same_v && std::is_same_v) return bit_cast(__builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( - bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); else if constexpr(std::is_same_v && std::is_same_v) return bit_cast(__builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8( - bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); else if constexpr(std::is_same_v && std::is_same_v) return bit_cast(__builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8( - bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); else if constexpr(std::is_same_v && std::is_same_v) return bit_cast(__builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8( - bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); #elif defined(__gfx908__) || defined(__gfx90a__) CVecType c_vec{0.f}; static_for<0, 8, 1>{}([&](auto k) { @@ -1580,7 +1580,7 @@ struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8 { #if defined(__gfx94__) or defined(__gfx95__) c_vec = __builtin_amdgcn_mfma_i32_32x32x16_i8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); #elif defined(__gfx908__) || defined(__gfx90a__) static_for<0, 8, 1>{}([&](auto k) { float a_f32 = @@ -1650,7 +1650,7 @@ struct WarpGemmAttributeMfmaImpl_i32_16x16x32_i8 { #if defined(__gfx94__) or defined(__gfx95__) c_vec = __builtin_amdgcn_mfma_i32_16x16x32_i8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); #else ck_tile::ignore = c_vec; ck_tile::ignore = a_vec; @@ -1709,7 +1709,7 @@ struct WarpGemmAttributeMfmaImpl_i32_16x16x64_i8 { #if defined(__gfx95__) c_vec = __builtin_amdgcn_mfma_i32_16x16x64_i8( - bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); #else ck_tile::ignore = c_vec; ck_tile::ignore = a_vec; @@ -1767,8 +1767,8 @@ struct WarpGemmAttributeMfmaImpl_i32_32x32x32_i8 else { #if defined(__gfx95__) - c_vec = - __builtin_amdgcn_mfma_i32_32x32x32_i8(a_vec, bit_cast(b_vec), c_vec, 0, 0, 0); + c_vec = __builtin_amdgcn_mfma_i32_32x32x32_i8( + a_vec, bit_cast(b_vec), c_vec, 0, 0, 0); #else ck_tile::ignore = c_vec; ck_tile::ignore = a_vec; diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp index 1373dbc497..d2a38b2a81 100644 --- a/profiler/include/profiler/profile_gemm_impl.hpp +++ b/profiler/include/profiler/profile_gemm_impl.hpp @@ -6,7 +6,9 @@ #include #include #include +#if defined(__unix__) #include +#endif #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" @@ -213,7 +215,9 @@ int profile_gemm_impl(int do_verification, instance_id++; } +#if defined(__unix__) sleep(2); +#endif // Run the best instance again { diff --git a/profiler/src/profile_batched_gemm_b_scale.cpp b/profiler/src/profile_batched_gemm_b_scale.cpp index f768a17570..5fe6f490be 100644 --- a/profiler/src/profile_batched_gemm_b_scale.cpp +++ b/profiler/src/profile_batched_gemm_b_scale.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "profiler/profile_batched_gemm_b_scale_impl.hpp" #include "profiler_operation_registry.hpp" @@ -114,7 +115,7 @@ int profile_batched_gemm_b_scale(int argc, char* argv[]) n_iter = std::stoi(argv[18]); rotating = std::stoull(argv[19]) * 1024 * 1024; - printf("n_warmup:%d, n_iter:%d, rotating:%lu\n", n_warmup, n_iter, rotating); + printf("n_warmup:%d, n_iter:%d, rotating:%" PRIu64 "\n", n_warmup, n_iter, rotating); } using F32 = float; diff --git a/profiler/src/profile_gemm_b_scale.cpp b/profiler/src/profile_gemm_b_scale.cpp index 443ebff834..7bcc96a434 100644 --- a/profiler/src/profile_gemm_b_scale.cpp +++ b/profiler/src/profile_gemm_b_scale.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "profiler/profile_gemm_b_scale_impl.hpp" #include "profiler_operation_registry.hpp" @@ -100,7 +101,7 @@ int profile_gemm_b_scale(int argc, char* argv[]) n_iter = std::stoi(argv[17]); rotating = std::stoull(argv[18]) * 1024 * 1024; - printf("n_warmup:%d, n_iter:%d, rotating:%lu\n", n_warmup, n_iter, rotating); + printf("n_warmup:%d, n_iter:%d, rotating:%" PRIu64 "\n", n_warmup, n_iter, rotating); } using F32 = float; diff --git a/test/scatter_gather/scatter_gather.cpp b/test/scatter_gather/scatter_gather.cpp index 81765b43e5..874c4d86c0 100644 --- a/test/scatter_gather/scatter_gather.cpp +++ b/test/scatter_gather/scatter_gather.cpp @@ -140,8 +140,8 @@ union pixel { struct __attribute__((packed)) { - unsigned int r : 6; - unsigned int c : 10; + ushort r : 6; + ushort c : 10; }; ushort data; }; From 84e926d1ba2d12cad612954a9ecec03381bf76b0 Mon Sep 17 00:00:00 2001 From: amd-khushbu Date: Wed, 16 Jul 2025 21:17:18 +0000 Subject: [PATCH 4/4] Fixing numerical error, and interchange preshuffle configs to match with flatmm --- example/ck_tile/03_gemm/gemm_utils.hpp | 8 ++++---- example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp | 2 +- example/ck_tile/03_gemm/run_gemm_example.inc | 12 ++++++++++-- example/ck_tile/18_flatmm/run_flatmm_example.inc | 2 +- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/example/ck_tile/03_gemm/gemm_utils.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp index 9deccc7f16..7a9b5afaa2 100644 --- a/example/ck_tile/03_gemm/gemm_utils.hpp +++ b/example/ck_tile/03_gemm/gemm_utils.hpp @@ -241,8 +241,8 @@ struct GemmConfigPreshufle_1 : public GemmConfigBase static constexpr ck_tile::index_t N_Warp = 4; static constexpr ck_tile::index_t K_Warp = 1; - static constexpr ck_tile::index_t M_Warp_Tile = 32; - static constexpr ck_tile::index_t N_Warp_Tile = 32; + static constexpr ck_tile::index_t M_Warp_Tile = 16; + static constexpr ck_tile::index_t N_Warp_Tile = 16; static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm(); static constexpr int kBlockPerCu = 2; @@ -263,8 +263,8 @@ struct GemmConfigPreshufle_2 : public GemmConfigBase static constexpr ck_tile::index_t N_Warp = 4; static constexpr ck_tile::index_t K_Warp = 1; - static constexpr ck_tile::index_t M_Warp_Tile = 16; - static constexpr ck_tile::index_t N_Warp_Tile = 16; + static constexpr ck_tile::index_t M_Warp_Tile = 32; + static constexpr ck_tile::index_t N_Warp_Tile = 32; static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm(); static constexpr int kBlockPerCu = 2; diff --git a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp index f57c24f458..b7b0701080 100644 --- a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp +++ b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp @@ -220,7 +220,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a auto [result, arg_parser] = create_args(argc, argv); bool preshuffle = GemmConfig::Preshuffle; - if(preshuffle && a_layout != "R" && b_layout != "C") + if(preshuffle && (a_layout != "R" || b_layout != "C")) { throw std::runtime_error( "Preshuffle is supported only for A(Row major), B(column major) input matrices!"); diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index f13a4b693b..83836117e9 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -315,8 +315,16 @@ int run_gemm_example_with_layouts(int argc, if(init_method == 0) { - ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); - ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + if constexpr(preshuffle) + { + ck_tile::FillUniformDistribution{-.5f, .5f}(a_m_k); + ck_tile::FillUniformDistribution{-.5f, .5f}(b_k_n); + } + else + { + ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); + ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + } } else if(init_method == 1) { diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc index b583612cfb..8f39b07be5 100644 --- a/example/ck_tile/18_flatmm/run_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc @@ -18,7 +18,7 @@ constexpr const char* DataTypeToString() { return "bf8"; } - else if constexpr(std::is_same_v) + else if constexpr(std::is_same_v) { return "bf16"; }