mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 02:54:21 +00:00
Merge commit 'fb96b49666ddd4d7ccfd3528b1859796657e1a6b' into develop
This commit is contained in:
@@ -98,6 +98,12 @@ add_compile_options(-Wno-pass-failed)
|
||||
add_compile_options(-Wno-switch-default)
|
||||
add_compile_options(-Wno-unique-object-duplication)
|
||||
|
||||
# add -Og -gdwarf64 for debug builds
|
||||
add_compile_options(
|
||||
"$<$<CONFIG:Debug>:-Og>"
|
||||
"$<$<CONFIG:Debug>:-gdwarf64>"
|
||||
)
|
||||
|
||||
# Recent change in compiler makes this warning ON by default, which led to compile errors.
|
||||
add_compile_options(-Wno-nrvo)
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -14,7 +14,7 @@ CK_TILE_HOST void
|
||||
reference_softmax(const HostTensor<InputType>& x, HostTensor<OutputType>& y, index_t dim = -1)
|
||||
{
|
||||
index_t rank = x.get_num_of_dimension();
|
||||
assert(rank == y.get_num_of_dimension());
|
||||
assert(static_cast<std::size_t>(rank) == y.get_num_of_dimension());
|
||||
assert(dim == -1 || dim < rank);
|
||||
|
||||
index_t target_dim = dim == -1 ? (rank - 1) : dim;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -38,8 +38,8 @@ CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
|
||||
{
|
||||
// rank must be the same
|
||||
index_t rank = x.get_num_of_dimension();
|
||||
assert(rank == y_values.get_num_of_dimension());
|
||||
assert(rank == y_indices.get_num_of_dimension());
|
||||
assert(static_cast<std::size_t>(rank) == y_values.get_num_of_dimension());
|
||||
assert(static_cast<size_t>(rank) == y_indices.get_num_of_dimension());
|
||||
assert(dim == -1 || dim < rank);
|
||||
|
||||
index_t topk_dim = dim == -1 ? (rank - 1) : dim;
|
||||
@@ -47,7 +47,8 @@ CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
|
||||
auto x_len = x.get_lengths();
|
||||
|
||||
assert(k <= topk_src_len);
|
||||
assert(k == y_values.get_length(topk_dim) && k == y_indices.get_length(topk_dim));
|
||||
assert(static_cast<size_t>(k) == y_values.get_length(topk_dim) &&
|
||||
static_cast<size_t>(k) == y_indices.get_length(topk_dim));
|
||||
|
||||
index_t n_parallel = x.get_element_size() / topk_src_len;
|
||||
|
||||
|
||||
@@ -175,6 +175,10 @@ function(add_instance_library INSTANCE_NAME)
|
||||
|
||||
target_compile_features(${INSTANCE_NAME} PUBLIC)
|
||||
|
||||
# splits debug information into separate .dwo files to reduce debug section size
|
||||
if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
|
||||
target_compile_options(${INSTANCE_NAME} PRIVATE -gsplit-dwarf)
|
||||
endif()
|
||||
# flags to compress the library
|
||||
if(NOT DISABLE_OFFLOAD_COMPRESS AND NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
|
||||
message(DEBUG "Adding --offload-compress flag for ${INSTANCE_NAME}")
|
||||
|
||||
@@ -187,11 +187,11 @@ __device__ AFragT load_A_col_major(AType const* input_ptr)
|
||||
auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_M);
|
||||
auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_M);
|
||||
|
||||
using ARawT = typename scalar_type<AFragT>::type;
|
||||
using AScalarFragT =
|
||||
vector_type<ARawT,
|
||||
BLOCK_M * BLOCK_K / WAVE_SIZE /
|
||||
(ck::is_same_v<ck::remove_cvref_t<AType>, ck::f4x2_pk_t> ? 2 : 1)>::type;
|
||||
using ARawT = typename scalar_type<AFragT>::type;
|
||||
using AScalarFragT = typename vector_type<
|
||||
ARawT,
|
||||
BLOCK_M * BLOCK_K / WAVE_SIZE /
|
||||
(ck::is_same_v<ck::remove_cvref_t<AType>, ck::f4x2_pk_t> ? 2 : 1)>::type;
|
||||
|
||||
AScalarFragT fragA{};
|
||||
|
||||
@@ -319,8 +319,9 @@ __device__ AFragT load_A_row_major(AType const* input_ptr)
|
||||
// Flatten to 1D row_major offsets.
|
||||
auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; };
|
||||
|
||||
using ARawT = typename scalar_type<AFragT>::type;
|
||||
using AScalarChunkT = vector_type<ARawT, scalar_type<AFragT>::vector_size / num_chunks>::type;
|
||||
using ARawT = typename scalar_type<AFragT>::type;
|
||||
using AScalarChunkT =
|
||||
typename vector_type<ARawT, scalar_type<AFragT>::vector_size / num_chunks>::type;
|
||||
|
||||
union
|
||||
{
|
||||
@@ -544,8 +545,9 @@ __device__ BFragT load_B_col_major(BType const* input_ptr)
|
||||
|
||||
auto majorStepCoord2D = std::make_pair(chunk_offset, 0); // read a chunk from a col
|
||||
|
||||
using BRawT = typename scalar_type<BFragT>::type;
|
||||
using BScalarChunkT = vector_type<BRawT, scalar_type<BFragT>::vector_size / num_chunks>::type;
|
||||
using BRawT = typename scalar_type<BFragT>::type;
|
||||
using BScalarChunkT =
|
||||
typename vector_type<BRawT, scalar_type<BFragT>::vector_size / num_chunks>::type;
|
||||
|
||||
union
|
||||
{
|
||||
@@ -780,7 +782,7 @@ struct store_C_col_major<CType, CFragT, 32, 32>
|
||||
|
||||
// we can vector store 4 contiguous elements at a time.
|
||||
using CRawT = typename scalar_type<CFragT>::type;
|
||||
using CScalarFragT = vector_type<CRawT, VW>::type;
|
||||
using CScalarFragT = typename vector_type<CRawT, VW>::type;
|
||||
union
|
||||
{
|
||||
CFragT frag;
|
||||
@@ -940,12 +942,14 @@ __global__ void matmul(const packed_type_t<AType>* a, const packed_type_t<BType>
|
||||
assert(threadIdx.x < WAVE_SIZE);
|
||||
assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1);
|
||||
|
||||
using AFragT = vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
|
||||
using BFragT = vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
|
||||
using AFragT =
|
||||
typename vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
|
||||
using BFragT =
|
||||
typename vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
|
||||
|
||||
using CFragT = vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
|
||||
using CFragT = typename vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
|
||||
using AccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>;
|
||||
using RawAccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
|
||||
using RawAccumFragT = typename vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
|
||||
|
||||
// Create frags
|
||||
auto fragA = AFragT{};
|
||||
@@ -1019,14 +1023,16 @@ __global__ void matmul(const packed_type_t<AType>* a,
|
||||
assert(threadIdx.x < WAVE_SIZE);
|
||||
assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1);
|
||||
|
||||
using AFragT = vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
|
||||
using BFragT = vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
|
||||
using AFragT =
|
||||
typename vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
|
||||
using BFragT =
|
||||
typename vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
|
||||
|
||||
using CFragT = vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
|
||||
using CFragT = typename vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
|
||||
using AccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>;
|
||||
using RawAccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
|
||||
using AScaleFragT = vector_type<ScaleType, 1>::type;
|
||||
using BScaleFragT = vector_type<ScaleType, 1>::type;
|
||||
using RawAccumFragT = typename vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
|
||||
using AScaleFragT = typename vector_type<ScaleType, 1>::type;
|
||||
using BScaleFragT = typename vector_type<ScaleType, 1>::type;
|
||||
|
||||
// Create frags
|
||||
auto fragA = AFragT{};
|
||||
|
||||
Reference in New Issue
Block a user