mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
tidy
This commit is contained in:
@@ -80,16 +80,16 @@ __global__ void
|
|||||||
// first cast void CONSTANT void* to void*
|
// first cast void CONSTANT void* to void*
|
||||||
// second cast void* to Desc*
|
// second cast void* to Desc*
|
||||||
// the copy constructor of tensor descriptor doesn't take address_space(4)
|
// the copy constructor of tensor descriptor doesn't take address_space(4)
|
||||||
const auto a_k_m0_m1_grid_desc =
|
const auto a_k_m0_m1_grid_desc = *reinterpret_cast<const AKM0M1GridDesc*>(
|
||||||
*reinterpret_cast<const AKM0M1GridDesc*>((const void*)p_a_k_m0_m1_grid_desc);
|
cast_pointer_to_generic_address_space(p_a_k_m0_m1_grid_desc));
|
||||||
const auto b_k_n0_n1_grid_desc =
|
const auto b_k_n0_n1_grid_desc = *reinterpret_cast<const BKN0N1GridDesc*>(
|
||||||
*reinterpret_cast<const BKN0N1GridDesc*>((const void*)p_b_k_n0_n1_grid_desc);
|
cast_pointer_to_generic_address_space(p_b_k_n0_n1_grid_desc));
|
||||||
const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
|
const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
|
||||||
*reinterpret_cast<const CM0M10M11N0N10N11GridDesc*>(
|
*reinterpret_cast<const CM0M10M11N0N10N11GridDesc*>(
|
||||||
(const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc);
|
cast_pointer_to_generic_address_space(p_c_m0_m10_m11_n0_n10_n11_grid_desc));
|
||||||
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
|
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
|
||||||
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
|
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
|
||||||
(const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor);
|
cast_pointer_to_generic_address_space(p_c_blockid_to_m0_n0_block_cluster_adaptor));
|
||||||
|
|
||||||
constexpr index_t shared_block_size =
|
constexpr index_t shared_block_size =
|
||||||
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
|
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
|
||||||
|
|||||||
@@ -80,16 +80,16 @@ __global__ void
|
|||||||
// first cast void CONSTANT void* to void*
|
// first cast void CONSTANT void* to void*
|
||||||
// second cast void* to Desc*
|
// second cast void* to Desc*
|
||||||
// the copy constructor of tensor descriptor doesn't take address_space(4)
|
// the copy constructor of tensor descriptor doesn't take address_space(4)
|
||||||
const auto a_k0_m0_m1_k1_grid_desc =
|
const auto a_k0_m0_m1_k1_grid_desc = *reinterpret_cast<const AK0M0M1K1GridDesc*>(
|
||||||
*reinterpret_cast<const AK0M0M1K1GridDesc*>((const void*)p_a_k0_m0_m1_k1_grid_desc);
|
cast_pointer_to_generic_address_space(p_a_k0_m0_m1_k1_grid_desc));
|
||||||
const auto b_k0_n0_n1_k1_grid_desc =
|
const auto b_k0_n0_n1_k1_grid_desc = *reinterpret_cast<const BK0N0N1K1GridDesc*>(
|
||||||
*reinterpret_cast<const BK0N0N1K1GridDesc*>((const void*)p_b_k0_n0_n1_k1_grid_desc);
|
cast_pointer_to_generic_address_space(p_b_k0_n0_n1_k1_grid_desc));
|
||||||
const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
|
const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
|
||||||
*reinterpret_cast<const CM0M10M11N0N10N11GridDesc*>(
|
*reinterpret_cast<const CM0M10M11N0N10N11GridDesc*>(
|
||||||
(const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc);
|
cast_pointer_to_generic_address_space(p_c_m0_m10_m11_n0_n10_n11_grid_desc));
|
||||||
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
|
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
|
||||||
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
|
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
|
||||||
(const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor);
|
cast_pointer_to_generic_address_space(p_c_blockid_to_m0_n0_block_cluster_adaptor));
|
||||||
|
|
||||||
constexpr index_t shared_block_size =
|
constexpr index_t shared_block_size =
|
||||||
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
|
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
|
||||||
|
|||||||
@@ -69,14 +69,14 @@ __global__ void
|
|||||||
constexpr index_t shared_block_size =
|
constexpr index_t shared_block_size =
|
||||||
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
|
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
|
||||||
|
|
||||||
const auto a_k0_m_k1_grid_desc =
|
const auto a_k0_m_k1_grid_desc = *reinterpret_cast<const AK0MK1GridDesc*>(
|
||||||
*reinterpret_cast<const AK0MK1GridDesc*>((const void*)p_a_k0_m_k1_grid_desc);
|
cast_pointer_to_generic_address_space(p_a_k0_m_k1_grid_desc));
|
||||||
const auto b_k0_n_k1_grid_desc =
|
const auto b_k0_n_k1_grid_desc = *reinterpret_cast<const BK0NK1GridDesc*>(
|
||||||
*reinterpret_cast<const BK0NK1GridDesc*>((const void*)p_b_k0_n_k1_grid_desc);
|
cast_pointer_to_generic_address_space(p_b_k0_n_k1_grid_desc));
|
||||||
const auto c_m0_m1_m2_n_grid_desc =
|
const auto c_m0_m1_m2_n_grid_desc = *reinterpret_cast<const CM0M1M2NGridDesc*>(
|
||||||
*reinterpret_cast<const CM0M1M2NGridDesc*>((const void*)p_c_m0_m1_m2_n_grid_desc);
|
cast_pointer_to_generic_address_space(p_c_m0_m1_m2_n_grid_desc));
|
||||||
const auto c_block_cluster_adaptor =
|
const auto c_block_cluster_adaptor = *reinterpret_cast<const CBlockClusterAdaptor*>(
|
||||||
*reinterpret_cast<const CBlockClusterAdaptor*>((const void*)p_c_block_cluster_adaptor);
|
cast_pointer_to_generic_address_space(p_c_block_cluster_adaptor));
|
||||||
|
|
||||||
__shared__ FloatAB p_shared_block[shared_block_size];
|
__shared__ FloatAB p_shared_block[shared_block_size];
|
||||||
|
|
||||||
|
|||||||
25
composable_kernel/include/utility/amd_address_space.hpp
Normal file
25
composable_kernel/include/utility/amd_address_space.hpp
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
#ifndef CK_AMD_ADDRESS_SPACE_HPP
|
||||||
|
#define CK_AMD_ADDRESS_SPACE_HPP
|
||||||
|
|
||||||
|
#include "config.hpp"
|
||||||
|
|
||||||
|
namespace ck {
|
||||||
|
|
||||||
|
enum AddressSpaceEnum_t
|
||||||
|
{
|
||||||
|
Generic,
|
||||||
|
Global,
|
||||||
|
Lds,
|
||||||
|
Sgpr,
|
||||||
|
Vgpr,
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__device__ T* cast_pointer_to_generic_address_space(T CONSTANT* p)
|
||||||
|
{
|
||||||
|
return (T*)p;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace ck
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -1,12 +1,12 @@
|
|||||||
#ifndef CK_AMD_BUFFER_ADDRESSING_V2_HPP
|
#ifndef CK_AMD_BUFFER_ADDRESSING_HPP
|
||||||
#define CK_AMD_BUFFER_ADDRESSING_V2_HPP
|
#define CK_AMD_BUFFER_ADDRESSING_HPP
|
||||||
|
|
||||||
#include "data_type.hpp"
|
#include "data_type.hpp"
|
||||||
|
|
||||||
namespace ck {
|
namespace ck {
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
union BufferResource_v2
|
union BufferResource
|
||||||
{
|
{
|
||||||
// 128 bit SGPRs to supply buffer resource in buffer instructions
|
// 128 bit SGPRs to supply buffer resource in buffer instructions
|
||||||
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
|
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
|
||||||
@@ -19,7 +19,7 @@ union BufferResource_v2
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t data_space_size)
|
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t data_space_size)
|
||||||
{
|
{
|
||||||
BufferResource_v2<T> wave_buffer_resource;
|
BufferResource<T> wave_buffer_resource;
|
||||||
|
|
||||||
// wavewise base address (64 bit)
|
// wavewise base address (64 bit)
|
||||||
wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
|
wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
|
||||||
@@ -23,9 +23,10 @@
|
|||||||
#include "tuple.hpp"
|
#include "tuple.hpp"
|
||||||
#include "tuple_helper.hpp"
|
#include "tuple_helper.hpp"
|
||||||
#include "type.hpp"
|
#include "type.hpp"
|
||||||
#include "utility.hpp"
|
|
||||||
#include "magic_division.hpp"
|
#include "magic_division.hpp"
|
||||||
#include "amd_buffer_addressing_v2.hpp"
|
#include "utility.hpp"
|
||||||
|
#include "amd_address_space.hpp"
|
||||||
|
#include "amd_buffer_addressing.hpp"
|
||||||
#include "static_buffer.hpp"
|
#include "static_buffer.hpp"
|
||||||
#include "dynamic_buffer.hpp"
|
#include "dynamic_buffer.hpp"
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
#endif
|
#endif
|
||||||
#include "bfloat16_dev.hpp"
|
#include "bfloat16_dev.hpp"
|
||||||
|
|
||||||
// address space for kernel parameter
|
// "Constant" address space for kernel parameter
|
||||||
#define CONSTANT __attribute__((address_space(4)))
|
#define CONSTANT __attribute__((address_space(4)))
|
||||||
|
|
||||||
// GPU target
|
// GPU target
|
||||||
@@ -120,15 +120,6 @@
|
|||||||
|
|
||||||
namespace ck {
|
namespace ck {
|
||||||
|
|
||||||
enum AddressSpaceEnum_t
|
|
||||||
{
|
|
||||||
Generic,
|
|
||||||
Global,
|
|
||||||
Lds,
|
|
||||||
Sgpr,
|
|
||||||
Vgpr
|
|
||||||
};
|
|
||||||
|
|
||||||
enum InMemoryDataOperationEnum_t
|
enum InMemoryDataOperationEnum_t
|
||||||
{
|
{
|
||||||
Set,
|
Set,
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
namespace ck {
|
namespace ck {
|
||||||
|
|
||||||
#include "amd_buffer_addressing_v2.hpp"
|
#include "amd_buffer_addressing.hpp"
|
||||||
|
|
||||||
template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
|
template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
|
||||||
struct DynamicBuffer
|
struct DynamicBuffer
|
||||||
|
|||||||
Reference in New Issue
Block a user