diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp index 5fd0aef6d6..06b7c7d324 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp @@ -54,6 +54,7 @@ __global__ void const ComputeBasePrtOfBatch compute_base_ptr_of_batch_, const Block2CTileMap block_2_ctile_map) { +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); @@ -88,6 +89,25 @@ __global__ void c_grid_desc_mblock_mperblock_nblock_nperblock, d_grid_desc_mblock_mperblock, block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = p_d0_grid; + ignore = p_d1_grid; + ignore = batch_count; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = d0_reduce_op; + ignore = d1_reduce_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = d_grid_desc_mblock_mperblock; + ignore = compute_base_ptr_of_batch_; + ignore = block_2_ctile_map; +#endif // end of if defined (defined(__gfx908__) || defined(__gfx90a__)) } template (p_a_grid, @@ -66,6 +67,23 @@ __global__ void c_grid_desc_mblock_mperblock_nblock_nperblock, d_grid_desc_mblock_mperblock, block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = p_d0_grid; + ignore = p_d1_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = d0_reduce_op; + ignore = d1_reduce_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = d_grid_desc_mblock_mperblock; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) } template (p_a_grid, @@ -51,6 +52,18 @@ __global__ void b_grid_desc_bk0_n_bk1, c_grid_desc_mblock_mperblock_nblock_nperblock, block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) } template (p_a_grid, @@ -52,6 +53,18 @@ __global__ void b_element_op, c_element_op, block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) } template ( @@ -56,6 +57,18 @@ __global__ void b_element_op, c_element_op, block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) } template < diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp index 6d1d64eb15..51477cdb40 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp @@ -45,6 +45,7 @@ __global__ void const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) { +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; GridwiseGemm::template Run( @@ -61,6 +62,20 @@ __global__ void b_element_op, c_element_op, block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = p_c0_grid; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) } template < diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp index da1b9bc6f1..fa6f1d1f6b 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp @@ -49,6 +49,7 @@ __global__ void const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) { +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; GridwiseGemm::template Run( @@ -67,6 +68,22 @@ __global__ void b_element_op, c_element_op, block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = p_c0_grid; + ignore = p_c1_grid; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) } template < diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp index 2b50852f43..6d95aec938 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp @@ -36,6 +36,7 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe DataType value) { + using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; constexpr auto I0 = Number<0>{}; diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp index 45f387ef2a..c1bc937062 100644 --- a/include/ck/utility/common_header.hpp +++ b/include/ck/utility/common_header.hpp @@ -13,6 +13,7 @@ #include "functional3.hpp" #include "functional4.hpp" #include "enable_if.hpp" +#include "ignore.hpp" #include "integral_constant.hpp" #include "math.hpp" #include "number.hpp" @@ -30,6 +31,7 @@ #include "debug.hpp" #include "amd_buffer_addressing.hpp" +#include "generic_memory_space_atomic_add.hpp" #include "get_id.hpp" #include "synchronization.hpp" #include "amd_address_space.hpp" diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index f1e541313c..bf8dc74f34 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -992,77 +992,6 @@ inline __host__ __device__ bhalf_t type_convert(float x) return uint16_t(u.int32 >> 16); } -// TODO: deprecate this -template -struct inner_product_with_conversion -{ - template - __device__ T operator()(typename vector_type::type a, - typename vector_type::type b) const - { - const vector_type a_vector{a}; - const vector_type b_vector{b}; - - T acc = 0; - - static_for<0, N, 1>{}([&](auto i) { - acc += type_convert(a_vector.Scalars()[i]) * type_convert(b_vector.Scalars()[i]); - }); - - return acc; - } - - __device__ T operator()(float_t a, float_t b) const - { - return type_convert(a) * type_convert(b); - } - - __device__ T operator()(int8x4_t a, int8x4_t b) const - { - const vector_type a_vector{a}; - const vector_type b_vector{b}; - - T acc = 0; - - static_for<0, 4, 1>{}([&](auto i) { - acc += type_convert(a_vector.AsType()[i]) * - type_convert(b_vector.AsType()[i]); - }); - - return acc; - } - - __device__ T operator()(int8x8_t a, int8x8_t b) const - { - const vector_type a_vector{a}; - const vector_type b_vector{b}; - - T acc = 0; - - static_for<0, 8, 1>{}([&](auto i) { - acc += type_convert(a_vector.AsType()[i]) * - type_convert(b_vector.AsType()[i]); - }); - - return acc; - } - - __device__ T operator()(int8x16_t a, int8x16_t b) const - { - const vector_type a_vector{a}; - const vector_type b_vector{b}; - - T acc = 0; - - static_for<0, 16, 1>{}([&](auto i) { - acc += type_convert(a_vector.AsType()[i]) * - type_convert(b_vector.AsType()[i]); - }); - - return acc; - } -}; - template struct NumericLimits { diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 3c8e5010a2..c00982dfff 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -1,11 +1,16 @@ #pragma once -#include "amd_buffer_addressing.hpp" -#include "c_style_pointer_cast.hpp" #include "config.hpp" #include "enable_if.hpp" +#include "c_style_pointer_cast.hpp" +#include "amd_buffer_addressing.hpp" +#include "generic_memory_space_atomic_add.hpp" namespace ck { +// T may be scalar or vector +// X may be scalar or vector +// T and X have same scalar type +// X contains multiple T template (&p_data_[i]), x); + atomic_add(c_style_pointer_cast(&p_data_[i]), x); } } } diff --git a/include/ck/utility/generic_memory_space_atomic_add.hpp b/include/ck/utility/generic_memory_space_atomic_add.hpp new file mode 100644 index 0000000000..8ee2081776 --- /dev/null +++ b/include/ck/utility/generic_memory_space_atomic_add.hpp @@ -0,0 +1,44 @@ +#pragma once +#include "data_type.hpp" + +namespace ck { + +template +__device__ X atomic_add(X* p_dst, const X& x); + +template <> +__device__ int32_t atomic_add(int32_t* p_dst, const int32_t& x) +{ + return atomicAdd(p_dst, x); +} + +template <> +__device__ uint32_t atomic_add(uint32_t* p_dst, const uint32_t& x) +{ + return atomicAdd(p_dst, x); +} + +template <> +__device__ float atomic_add(float* p_dst, const float& x) +{ + return atomicAdd(p_dst, x); +} + +template <> +__device__ float2_t atomic_add(float2_t* p_dst, const float2_t& x) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + const vector_type vx{x}; + vector_type vy{0}; + + vy.template AsType()(I0) = + atomicAdd(c_style_pointer_cast(p_dst), vx.template AsType()[I0]); + vy.template AsType()(I1) = + atomicAdd(c_style_pointer_cast(p_dst) + 1, vx.template AsType()[I1]); + + return vy.template AsType()[I0]; +} + +} // namespace ck diff --git a/script/cmake-rocm.sh b/script/cmake-rocm.sh index 5ba8820651..86b6236896 100755 --- a/script/cmake-rocm.sh +++ b/script/cmake-rocm.sh @@ -10,7 +10,7 @@ cmake -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ -D BUILD_DEV=OFF \ -D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \ +-D CMAKE_CXX_FLAGS=" -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \