From b29d10bf7959d49977e6f94ebd9ef17e62f6a437 Mon Sep 17 00:00:00 2001
From: "assistant-librarian[bot]"
Date: Mon, 4 Aug 2025 19:13:03 +0000
Subject: [PATCH] Merge commit 'fb96b49666ddd4d7ccfd3528b1859796657e1a6b' into develop

---
 CMakeLists.txt                                 |  6 +++
 .../host/reference/reference_softmax.hpp       |  4 +-
 .../ck_tile/host/reference/reference_topk.hpp  |  9 ++--
 .../gpu/CMakeLists.txt                         |  4 ++
 test/mx_mfma_op/mx_mfma_op.hpp                 | 46 +++++++++++--------
 5 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f49376d139..19c036e1a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,6 +98,12 @@ add_compile_options(-Wno-pass-failed)
 add_compile_options(-Wno-switch-default)
 add_compile_options(-Wno-unique-object-duplication)

+# add -Og -gdwarf64 for debug builds
+add_compile_options(
+    "$<$:-Og>"
+    "$<$:-gdwarf64>"
+)
+
 # Recent change in compiler makes this warning ON by default, which led to compile errors.
 add_compile_options(-Wno-nrvo)

diff --git a/include/ck_tile/host/reference/reference_softmax.hpp b/include/ck_tile/host/reference/reference_softmax.hpp
index d86e879944..4e729c437d 100644
--- a/include/ck_tile/host/reference/reference_softmax.hpp
+++ b/include/ck_tile/host/reference/reference_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -14,7 +14,7 @@ CK_TILE_HOST void reference_softmax(const HostTensor& x, HostTensor& y, index_t dim = -1)
 {
     index_t rank = x.get_num_of_dimension();
-    assert(rank == y.get_num_of_dimension());
+    assert(static_cast(rank) == y.get_num_of_dimension());
     assert(dim == -1 || dim < rank);

     index_t target_dim = dim == -1 ? (rank - 1) : dim;

diff --git a/include/ck_tile/host/reference/reference_topk.hpp b/include/ck_tile/host/reference/reference_topk.hpp
index 3d0404a2e5..0fc99a983a 100644
--- a/include/ck_tile/host/reference/reference_topk.hpp
+++ b/include/ck_tile/host/reference/reference_topk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -38,8 +38,8 @@ CK_TILE_HOST void reference_topk(const HostTensor& x,
 {
     // rank must be the same
     index_t rank = x.get_num_of_dimension();
-    assert(rank == y_values.get_num_of_dimension());
-    assert(rank == y_indices.get_num_of_dimension());
+    assert(static_cast(rank) == y_values.get_num_of_dimension());
+    assert(static_cast(rank) == y_indices.get_num_of_dimension());
     assert(dim == -1 || dim < rank);

     index_t topk_dim = dim == -1 ? (rank - 1) : dim;
@@ -47,7 +47,8 @@ CK_TILE_HOST void reference_topk(const HostTensor& x,
     auto x_len = x.get_lengths();

     assert(k <= topk_src_len);
-    assert(k == y_values.get_length(topk_dim) && k == y_indices.get_length(topk_dim));
+    assert(static_cast(k) == y_values.get_length(topk_dim) &&
+           static_cast(k) == y_indices.get_length(topk_dim));

     index_t n_parallel = x.get_element_size() / topk_src_len;

diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 5204b51edf..1eaaa7e6ba 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -175,6 +175,10 @@ function(add_instance_library INSTANCE_NAME)
     target_compile_features(${INSTANCE_NAME} PUBLIC)

+    # splits debug information into separate .dwo files to reduce debug section size
+    if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+        target_compile_options(${INSTANCE_NAME} PRIVATE -gsplit-dwarf)
+    endif()
     # flags to compress the library
     if(NOT DISABLE_OFFLOAD_COMPRESS AND NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
         message(DEBUG "Adding --offload-compress flag for ${INSTANCE_NAME}")
diff --git a/test/mx_mfma_op/mx_mfma_op.hpp b/test/mx_mfma_op/mx_mfma_op.hpp
index 4bb38a0c16..b2e615b9d8 100644
--- a/test/mx_mfma_op/mx_mfma_op.hpp
+++ b/test/mx_mfma_op/mx_mfma_op.hpp
@@ -187,11 +187,11 @@ __device__ AFragT load_A_col_major(AType const* input_ptr)
     auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_M);
     auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_M);

-    using ARawT = typename scalar_type::type;
-    using AScalarFragT =
-        vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type;
+    using ARawT = typename scalar_type::type;
+    using AScalarFragT = typename vector_type<
+        ARawT,
+        BLOCK_M * BLOCK_K / WAVE_SIZE /
+            (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)>::type;

     AScalarFragT fragA{};

@@ -319,8 +319,9 @@ __device__ AFragT load_A_row_major(AType const* input_ptr)
     // Flatten to 1D row_major offsets.
     auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; };

-    using ARawT = typename scalar_type::type;
-    using AScalarChunkT = vector_type::vector_size / num_chunks>::type;
+    using ARawT = typename scalar_type::type;
+    using AScalarChunkT =
+        typename vector_type::vector_size / num_chunks>::type;

     union
     {
@@ -544,8 +545,9 @@ __device__ BFragT load_B_col_major(BType const* input_ptr)
     auto majorStepCoord2D = std::make_pair(chunk_offset, 0);

     // read a chunk from a col
-    using BRawT = typename scalar_type::type;
-    using BScalarChunkT = vector_type::vector_size / num_chunks>::type;
+    using BRawT = typename scalar_type::type;
+    using BScalarChunkT =
+        typename vector_type::vector_size / num_chunks>::type;

     union
     {
@@ -780,7 +782,7 @@ struct store_C_col_major
         // we can vector store 4 contiguous elements at a time.
         using CRawT = typename scalar_type::type;
-        using CScalarFragT = vector_type::type;
+        using CScalarFragT = typename vector_type::type;

         union
         {
             CFragT frag;
@@ -940,12 +942,14 @@ __global__ void matmul(const packed_type_t* a, const packed_type_t
     assert(threadIdx.x < WAVE_SIZE);
     assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1);

-    using AFragT = vector_type::type;
-    using BFragT = vector_type::type;
+    using AFragT =
+        typename vector_type::type;
+    using BFragT =
+        typename vector_type::type;

-    using CFragT = vector_type::type;
+    using CFragT = typename vector_type::type;
     using AccumFragT = vector_type;
-    using RawAccumFragT = vector_type::type;
+    using RawAccumFragT = typename vector_type::type;

     // Create frags
     auto fragA = AFragT{};
@@ -1019,14 +1023,16 @@ __global__ void matmul(const packed_type_t* a,
     assert(threadIdx.x < WAVE_SIZE);
     assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1);

-    using AFragT = vector_type::type;
-    using BFragT = vector_type::type;
+    using AFragT =
+        typename vector_type::type;
+    using BFragT =
+        typename vector_type::type;

-    using CFragT = vector_type::type;
+    using CFragT = typename vector_type::type;
     using AccumFragT = vector_type;
-    using RawAccumFragT = vector_type::type;
-    using AScaleFragT = vector_type::type;
-    using BScaleFragT = vector_type::type;
+    using RawAccumFragT = typename vector_type::type;
+    using AScaleFragT = typename vector_type::type;
+    using BScaleFragT = typename vector_type::type;

     // Create frags
     auto fragA = AFragT{};