From 80120f0a0c524d1efc0249926a73d5020f0efd67 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 9 Aug 2021 21:10:09 +0000 Subject: [PATCH] tidy --- .../gridwise_dynamic_gemm_dlops_v1r2.hpp | 12 ++++----- .../gridwise_dynamic_gemm_dlops_v1r3.hpp | 12 ++++----- .../gridwise_dynamic_gemm_xdlops_v2r3.hpp | 16 ++++++------ .../include/utility/amd_address_space.hpp | 25 +++++++++++++++++++ ...ssing_v2.hpp => amd_buffer_addressing.hpp} | 8 +++--- .../include/utility/common_header.hpp | 5 ++-- composable_kernel/include/utility/config.hpp | 11 +------- .../include/utility/dynamic_buffer.hpp | 2 +- 8 files changed, 54 insertions(+), 37 deletions(-) create mode 100644 composable_kernel/include/utility/amd_address_space.hpp rename composable_kernel/include/utility/{amd_buffer_addressing_v2.hpp => amd_buffer_addressing.hpp} (99%) diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp index 2c45e42a0e..e4858af492 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp @@ -80,16 +80,16 @@ __global__ void // first cast void CONSTANT void* to void* // second cast void* to Desc* // the copy constructor of tensor descriptor doesn't take address_space(4) - const auto a_k_m0_m1_grid_desc = - *reinterpret_cast((const void*)p_a_k_m0_m1_grid_desc); - const auto b_k_n0_n1_grid_desc = - *reinterpret_cast((const void*)p_b_k_n0_n1_grid_desc); + const auto a_k_m0_m1_grid_desc = *reinterpret_cast( + cast_pointer_to_generic_address_space(p_a_k_m0_m1_grid_desc)); + const auto b_k_n0_n1_grid_desc = *reinterpret_cast( + cast_pointer_to_generic_address_space(p_b_k_n0_n1_grid_desc)); const auto c_m0_m10_m11_n0_n10_n11_grid_desc = *reinterpret_cast( - (const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc); + cast_pointer_to_generic_address_space(p_c_m0_m10_m11_n0_n10_n11_grid_desc)); const auto c_blockid_to_m0_n0_block_cluster_adaptor = *reinterpret_cast( - (const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); + cast_pointer_to_generic_address_space(p_c_blockid_to_m0_n0_block_cluster_adaptor)); constexpr index_t shared_block_size = GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp index 5de41b1f7c..244c376cf8 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp @@ -80,16 +80,16 @@ __global__ void // first cast void CONSTANT void* to void* // second cast void* to Desc* // the copy constructor of tensor descriptor doesn't take address_space(4) - const auto a_k0_m0_m1_k1_grid_desc = - *reinterpret_cast((const void*)p_a_k0_m0_m1_k1_grid_desc); - const auto b_k0_n0_n1_k1_grid_desc = - *reinterpret_cast((const void*)p_b_k0_n0_n1_k1_grid_desc); + const auto a_k0_m0_m1_k1_grid_desc = *reinterpret_cast( + cast_pointer_to_generic_address_space(p_a_k0_m0_m1_k1_grid_desc)); + const auto b_k0_n0_n1_k1_grid_desc = *reinterpret_cast( + cast_pointer_to_generic_address_space(p_b_k0_n0_n1_k1_grid_desc)); const auto c_m0_m10_m11_n0_n10_n11_grid_desc = *reinterpret_cast( - (const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc); + cast_pointer_to_generic_address_space(p_c_m0_m10_m11_n0_n10_n11_grid_desc)); const auto c_blockid_to_m0_n0_block_cluster_adaptor = *reinterpret_cast( - (const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); + cast_pointer_to_generic_address_space(p_c_blockid_to_m0_n0_block_cluster_adaptor)); constexpr index_t shared_block_size = GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); diff --git a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp index 124623c702..3a8883b460 100644 --- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp @@ -69,14 +69,14 @@ __global__ void constexpr index_t shared_block_size = GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - const auto a_k0_m_k1_grid_desc = - *reinterpret_cast((const void*)p_a_k0_m_k1_grid_desc); - const auto b_k0_n_k1_grid_desc = - *reinterpret_cast((const void*)p_b_k0_n_k1_grid_desc); - const auto c_m0_m1_m2_n_grid_desc = - *reinterpret_cast((const void*)p_c_m0_m1_m2_n_grid_desc); - const auto c_block_cluster_adaptor = - *reinterpret_cast((const void*)p_c_block_cluster_adaptor); + const auto a_k0_m_k1_grid_desc = *reinterpret_cast( + cast_pointer_to_generic_address_space(p_a_k0_m_k1_grid_desc)); + const auto b_k0_n_k1_grid_desc = *reinterpret_cast( + cast_pointer_to_generic_address_space(p_b_k0_n_k1_grid_desc)); + const auto c_m0_m1_m2_n_grid_desc = *reinterpret_cast( + cast_pointer_to_generic_address_space(p_c_m0_m1_m2_n_grid_desc)); + const auto c_block_cluster_adaptor = *reinterpret_cast( + cast_pointer_to_generic_address_space(p_c_block_cluster_adaptor)); __shared__ FloatAB p_shared_block[shared_block_size]; diff --git a/composable_kernel/include/utility/amd_address_space.hpp b/composable_kernel/include/utility/amd_address_space.hpp new file mode 100644 index 0000000000..f9bb6a5133 --- /dev/null +++ b/composable_kernel/include/utility/amd_address_space.hpp @@ -0,0 +1,25 @@ +#ifndef CK_AMD_ADDRESS_SPACE_HPP +#define CK_AMD_ADDRESS_SPACE_HPP + +#include "config.hpp" + +namespace ck { + +enum AddressSpaceEnum_t +{ + Generic, + Global, + Lds, + Sgpr, + Vgpr, +}; + +template +__device__ T* cast_pointer_to_generic_address_space(T CONSTANT* p) +{ + return (T*)p; +} + +} // namespace ck + +#endif diff --git a/composable_kernel/include/utility/amd_buffer_addressing_v2.hpp b/composable_kernel/include/utility/amd_buffer_addressing.hpp similarity index 99% rename from composable_kernel/include/utility/amd_buffer_addressing_v2.hpp rename to composable_kernel/include/utility/amd_buffer_addressing.hpp index 0139bceb61..711af2e648 100644 --- a/composable_kernel/include/utility/amd_buffer_addressing_v2.hpp +++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp @@ -1,12 +1,12 @@ -#ifndef CK_AMD_BUFFER_ADDRESSING_V2_HPP -#define CK_AMD_BUFFER_ADDRESSING_V2_HPP +#ifndef CK_AMD_BUFFER_ADDRESSING_HPP +#define CK_AMD_BUFFER_ADDRESSING_HPP #include "data_type.hpp" namespace ck { template -union BufferResource_v2 +union BufferResource { // 128 bit SGPRs to supply buffer resource in buffer instructions // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions @@ -19,7 +19,7 @@ union BufferResource_v2 template __device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t data_space_size) { - BufferResource_v2 wave_buffer_resource; + BufferResource wave_buffer_resource; // wavewise base address (64 bit) wave_buffer_resource.address(Number<0>{}) = const_cast*>(p_wave); diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 5ff7688a1c..39b400b3cd 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -23,9 +23,10 @@ #include "tuple.hpp" #include "tuple_helper.hpp" #include "type.hpp" -#include "utility.hpp" #include "magic_division.hpp" -#include "amd_buffer_addressing_v2.hpp" +#include "utility.hpp" +#include "amd_address_space.hpp" +#include "amd_buffer_addressing.hpp" #include "static_buffer.hpp" #include "dynamic_buffer.hpp" diff --git a/composable_kernel/include/utility/config.hpp b/composable_kernel/include/utility/config.hpp index 4908d8d818..547d1fadbe 100644 --- a/composable_kernel/include/utility/config.hpp +++ b/composable_kernel/include/utility/config.hpp @@ -7,7 +7,7 @@ #endif #include "bfloat16_dev.hpp" -// address space for kernel parameter +// "Constant" address space for kernel parameter #define CONSTANT __attribute__((address_space(4))) // GPU target @@ -120,15 +120,6 @@ namespace ck { -enum AddressSpaceEnum_t -{ - Generic, - Global, - Lds, - Sgpr, - Vgpr -}; - enum InMemoryDataOperationEnum_t { Set, diff --git a/composable_kernel/include/utility/dynamic_buffer.hpp b/composable_kernel/include/utility/dynamic_buffer.hpp index 5f5f386306..34c28d7fef 100644 --- a/composable_kernel/include/utility/dynamic_buffer.hpp +++ b/composable_kernel/include/utility/dynamic_buffer.hpp @@ -3,7 +3,7 @@ namespace ck { -#include "amd_buffer_addressing_v2.hpp" +#include "amd_buffer_addressing.hpp" template struct DynamicBuffer