mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-03-22 08:07:38 +00:00
1. Add base class GridwiseGemm_xdl_cshuffle_base for all gridwise_gemm_xdl classes. - To select the correct LDS layout and epilogue behavior, three additional parameters are added. - ForceNaiveLdsLayout: disables the XOR-based LDS layout when it is true - DirectLoad: the pipeline only uses direct load, so we need to force the naive layout and ignore any padding on gfx9 - IsMxGemm: the epilogue has two additional dimensions 2. Move all LDS descriptor layout related functions to the base class, including - GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1 - GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1 - GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock 3. Move several LDS related helper functions to the base class, including - GetSharedMemoryNumberOfByte - GetABlockDescriptor_AKB_AK0PerBlock_MPerBlock_AK1 - GetBBlockDescriptor_BKB_BK0PerBlock_NPerBlock_BK1 - GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl 4. Move all C epilogue related code to the base class; four kinds of implementation are provided - RunEpilogueNoShuffle - RunEpilogue - RunMultiDEpilogue - RunMoeEpilogue
59 lines
1.8 KiB
C++
59 lines
1.8 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#pragma once
|
|
|
|
#include "ck/ck.hpp"
|
|
#include "ck/utility/array.hpp"
|
|
#include "ck/utility/container_helper.hpp"
|
|
#include "ck/utility/statically_indexed_array.hpp"
|
|
#include "ck/utility/container_element_picker.hpp"
|
|
#include "ck/utility/multi_index.hpp"
|
|
#include "ck/utility/data_type.hpp"
|
|
#include "ck/utility/functional.hpp"
|
|
#include "ck/utility/functional2.hpp"
|
|
#include "ck/utility/functional3.hpp"
|
|
#include "ck/utility/functional4.hpp"
|
|
#include "ck/utility/enable_if.hpp"
|
|
#include "ck/utility/ignore.hpp"
|
|
#include "ck/utility/integral_constant.hpp"
|
|
#include "ck/utility/math.hpp"
|
|
#include "ck/utility/number.hpp"
|
|
#include "ck/utility/sequence.hpp"
|
|
#include "ck/utility/sequence_helper.hpp"
|
|
#include "ck/utility/tuple.hpp"
|
|
#include "ck/utility/tuple_helper.hpp"
|
|
#include "ck/utility/type.hpp"
|
|
#include "ck/utility/type_convert.hpp"
|
|
#include "ck/utility/magic_division.hpp"
|
|
#include "ck/utility/c_style_pointer_cast.hpp"
|
|
#include "ck/utility/is_known_at_compile_time.hpp"
|
|
#include "ck/utility/transpose_vectors.hpp"
|
|
#include "ck/utility/inner_product.hpp"
|
|
#include "ck/utility/thread_group.hpp"
|
|
#include "ck/utility/debug.hpp"
|
|
|
|
#include "ck/utility/amd_arch.hpp"
|
|
#if __clang_major__ >= 20
|
|
#include "amd_buffer_addressing_builtins.hpp"
|
|
#else
|
|
#include "amd_buffer_addressing.hpp"
|
|
#endif
|
|
#include "ck/utility/amd_wave_read_first_lane.hpp"
|
|
#include "ck/utility/generic_memory_space_atomic.hpp"
|
|
#include "ck/utility/get_id.hpp"
|
|
#include "ck/utility/thread_group.hpp"
|
|
#include "ck/utility/synchronization.hpp"
|
|
#include "ck/utility/amd_address_space.hpp"
|
|
#include "ck/utility/static_buffer.hpp"
|
|
#include "ck/utility/dynamic_buffer.hpp"
|
|
|
|
// TODO: remove this (presumably the CK_USE_AMD_INLINE_ASM-guarded include below; confirm)
|
|
#if CK_USE_AMD_INLINE_ASM
|
|
#include "ck/utility/amd_inline_asm.hpp"
|
|
#endif
|
|
|
|
#ifdef CK_USE_AMD_MFMA
|
|
#include "ck/utility/amd_xdlops.hpp"
|
|
#endif
|