Files
composable_kernel/include/ck/utility/amd_arch.hpp
linqunAMD 23cefda140 [ck] add gridwise base class for in all xdl kernel (#186) (#3544)
1. Add base class GridwiseGemm_xdl_cshuffle_base for all gridwise_gemm_xdl classes.
- To select the correct LDS layout and epilogue behavior, three additional parameters are added.
- ForceNaiveLdsLayout: disable XOR based LDS layout when it is true
- DirectLoad: the pipeline only uses direct load, so we need to force the naive layout and ignore any padding on gfx9
- IsMxGemm: epilogue has two additional dimensions
2. Move all LDS descriptor layout related functions to base class, including
- GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1
- GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1
- GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
3. Move several LDS related helper functions to base class, including
- GetSharedMemoryNumberOfByte
- GetABlockDescriptor_AKB_AK0PerBlock_MPerBlock_AK1
- GetBBlockDescriptor_BKB_BK0PerBlock_NPerBlock_BK1
- GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
4. Move all c epilogue related code to base class; 4 kinds of implementation are provided
- RunEpilogueNoShuffle
- RunEpilogue
- RunMultiDEpilogue
- RunMoeEpilogue
2026-01-27 12:49:47 -08:00

86 lines
1.3 KiB
C++

// SPDX-License-Identifier: MIT
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
#pragma once
#include "ck/ck.hpp"
namespace ck {
// Architecture tags
// Empty tag type. get_device_arch() yields it when __gfx9__ is defined and
// __gfx950__ is not; get_max_vgpr_count() has an explicit specialization for it.
struct gfx9_t {};
// Empty tag type. get_device_arch() yields it when __gfx950__ is defined; the
// get_lds_size / get_n_lds_banks / get_max_vgpr_count queries specialize on it.
struct gfx950_t {};
// Empty tag type. get_device_arch() yields it when __gfx10__ is defined.
// NOTE(review): the tag says "gfx103" while the selecting macro is __gfx10__ —
// confirm this pairing is intentional.
struct gfx103_t {};
// Empty tag type. get_device_arch() yields it when __gfx11__ is defined.
struct gfx11_t {};
// Empty tag type. get_device_arch() yields it when __gfx12__ is defined.
struct gfx12_t {};
// Empty tag type. get_device_arch() yields it as the fallback when none of the
// architecture macros it tests is defined.
struct gfx_invalid_t {};
static constexpr auto get_device_arch()
{
#if defined(__gfx950__)
return gfx950_t{};
#elif defined(__gfx9__)
return gfx9_t{};
#elif defined(__gfx10__)
return gfx103_t{};
#elif defined(__gfx11__)
return gfx11_t{};
#elif defined(__gfx12__)
return gfx12_t{};
#else
return gfx_invalid_t{};
#endif
}
// Returns the LDS size constant for an architecture tag. The argument is
// unnamed and only its type matters.
// Generic case: 64 * 1024 (presumably a byte count — confirm against callers).
template <typename DeviceArch>
static constexpr index_t get_lds_size(DeviceArch /*arch*/)
{
    constexpr index_t kib          = 1024;
    constexpr index_t default_size = 64 * kib;
    return default_size;
}

// gfx950_t specialization: 160 * 1024.
template <>
constexpr index_t get_lds_size<gfx950_t>(gfx950_t /*arch*/)
{
    constexpr index_t kib         = 1024;
    constexpr index_t gfx950_size = 160 * kib;
    return gfx950_size;
}
// Returns the LDS bank count constant for an architecture tag. The argument is
// unnamed and only its type matters.
// Generic case: 32.
template <typename DeviceArch>
static constexpr index_t get_n_lds_banks(DeviceArch /*arch*/)
{
    constexpr index_t default_bank_count = 32;
    return default_bank_count;
}

// gfx950_t specialization: 64.
template <>
constexpr index_t get_n_lds_banks<gfx950_t>(gfx950_t /*arch*/)
{
    constexpr index_t gfx950_bank_count = 64;
    return gfx950_bank_count;
}
// Returns the maximum VGPR count constant for an architecture tag. The argument
// is unnamed and only its type matters.
// Generic case: 256.
template <typename DeviceArch>
static constexpr index_t get_max_vgpr_count(DeviceArch /*arch*/)
{
    constexpr index_t default_vgpr_limit = 256;
    return default_vgpr_limit;
}

// gfx950_t specialization: 512.
template <>
constexpr index_t get_max_vgpr_count<gfx950_t>(gfx950_t /*arch*/)
{
    constexpr index_t gfx950_vgpr_limit = 512;
    return gfx950_vgpr_limit;
}

// gfx9_t specialization: 512 (same value as gfx950_t).
template <>
constexpr index_t get_max_vgpr_count<gfx9_t>(gfx9_t /*arch*/)
{
    constexpr index_t gfx9_vgpr_limit = 512;
    return gfx9_vgpr_limit;
}
} // namespace ck