Merge commit 'fb96b49666ddd4d7ccfd3528b1859796657e1a6b' into develop

This commit is contained in:
assistant-librarian[bot]
2025-08-04 19:13:03 +00:00
parent d514464d6f
commit b29d10bf79
5 changed files with 43 additions and 26 deletions

View File

@@ -98,6 +98,12 @@ add_compile_options(-Wno-pass-failed)
add_compile_options(-Wno-switch-default)
add_compile_options(-Wno-unique-object-duplication)
# add -Og -gdwarf64 for debug builds
add_compile_options(
"$<$<CONFIG:Debug>:-Og>"
"$<$<CONFIG:Debug>:-gdwarf64>"
)
# Recent change in compiler makes this warning ON by default, which led to compile errors.
add_compile_options(-Wno-nrvo)

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -14,7 +14,7 @@ CK_TILE_HOST void
reference_softmax(const HostTensor<InputType>& x, HostTensor<OutputType>& y, index_t dim = -1)
{
index_t rank = x.get_num_of_dimension();
assert(rank == y.get_num_of_dimension());
assert(static_cast<std::size_t>(rank) == y.get_num_of_dimension());
assert(dim == -1 || dim < rank);
index_t target_dim = dim == -1 ? (rank - 1) : dim;

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -38,8 +38,8 @@ CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
{
// rank must be the same
index_t rank = x.get_num_of_dimension();
assert(rank == y_values.get_num_of_dimension());
assert(rank == y_indices.get_num_of_dimension());
assert(static_cast<std::size_t>(rank) == y_values.get_num_of_dimension());
assert(static_cast<size_t>(rank) == y_indices.get_num_of_dimension());
assert(dim == -1 || dim < rank);
index_t topk_dim = dim == -1 ? (rank - 1) : dim;
@@ -47,7 +47,8 @@ CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
auto x_len = x.get_lengths();
assert(k <= topk_src_len);
assert(k == y_values.get_length(topk_dim) && k == y_indices.get_length(topk_dim));
assert(static_cast<size_t>(k) == y_values.get_length(topk_dim) &&
static_cast<size_t>(k) == y_indices.get_length(topk_dim));
index_t n_parallel = x.get_element_size() / topk_src_len;

View File

@@ -175,6 +175,10 @@ function(add_instance_library INSTANCE_NAME)
target_compile_features(${INSTANCE_NAME} PUBLIC)
# splits debug information into separate .dwo files to reduce debug section size
if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
target_compile_options(${INSTANCE_NAME} PRIVATE -gsplit-dwarf)
endif()
# flags to compress the library
if(NOT DISABLE_OFFLOAD_COMPRESS AND NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
message(DEBUG "Adding --offload-compress flag for ${INSTANCE_NAME}")

View File

@@ -187,11 +187,11 @@ __device__ AFragT load_A_col_major(AType const* input_ptr)
auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_M);
auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_M);
using ARawT = typename scalar_type<AFragT>::type;
using AScalarFragT =
vector_type<ARawT,
BLOCK_M * BLOCK_K / WAVE_SIZE /
(ck::is_same_v<ck::remove_cvref_t<AType>, ck::f4x2_pk_t> ? 2 : 1)>::type;
using ARawT = typename scalar_type<AFragT>::type;
using AScalarFragT = typename vector_type<
ARawT,
BLOCK_M * BLOCK_K / WAVE_SIZE /
(ck::is_same_v<ck::remove_cvref_t<AType>, ck::f4x2_pk_t> ? 2 : 1)>::type;
AScalarFragT fragA{};
@@ -319,8 +319,9 @@ __device__ AFragT load_A_row_major(AType const* input_ptr)
// Flatten to 1D row_major offsets.
auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; };
using ARawT = typename scalar_type<AFragT>::type;
using AScalarChunkT = vector_type<ARawT, scalar_type<AFragT>::vector_size / num_chunks>::type;
using ARawT = typename scalar_type<AFragT>::type;
using AScalarChunkT =
typename vector_type<ARawT, scalar_type<AFragT>::vector_size / num_chunks>::type;
union
{
@@ -544,8 +545,9 @@ __device__ BFragT load_B_col_major(BType const* input_ptr)
auto majorStepCoord2D = std::make_pair(chunk_offset, 0); // read a chunk from a col
using BRawT = typename scalar_type<BFragT>::type;
using BScalarChunkT = vector_type<BRawT, scalar_type<BFragT>::vector_size / num_chunks>::type;
using BRawT = typename scalar_type<BFragT>::type;
using BScalarChunkT =
typename vector_type<BRawT, scalar_type<BFragT>::vector_size / num_chunks>::type;
union
{
@@ -780,7 +782,7 @@ struct store_C_col_major<CType, CFragT, 32, 32>
// we can vector store 4 contiguous elements at a time.
using CRawT = typename scalar_type<CFragT>::type;
using CScalarFragT = vector_type<CRawT, VW>::type;
using CScalarFragT = typename vector_type<CRawT, VW>::type;
union
{
CFragT frag;
@@ -940,12 +942,14 @@ __global__ void matmul(const packed_type_t<AType>* a, const packed_type_t<BType>
assert(threadIdx.x < WAVE_SIZE);
assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1);
using AFragT = vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
using BFragT = vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
using AFragT =
typename vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
using BFragT =
typename vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
using CFragT = vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using CFragT = typename vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using AccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>;
using RawAccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using RawAccumFragT = typename vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
// Create frags
auto fragA = AFragT{};
@@ -1019,14 +1023,16 @@ __global__ void matmul(const packed_type_t<AType>* a,
assert(threadIdx.x < WAVE_SIZE);
assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1);
using AFragT = vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
using BFragT = vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
using AFragT =
typename vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
using BFragT =
typename vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
using CFragT = vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using CFragT = typename vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using AccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>;
using RawAccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using AScaleFragT = vector_type<ScaleType, 1>::type;
using BScaleFragT = vector_type<ScaleType, 1>::type;
using RawAccumFragT = typename vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
using AScaleFragT = typename vector_type<ScaleType, 1>::type;
using BScaleFragT = typename vector_type<ScaleType, 1>::type;
// Create frags
auto fragA = AFragT{};