From f2ac7832c65969f5b3ecf7972518d55ee099c03b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 11 Aug 2021 09:42:53 -0500 Subject: [PATCH] make innner product compatiable on gfx900 --- .../threadwise_contraction_dlops.hpp | 9 +- .../include/utility/amd_dlop.hpp | 188 ---------------- .../include/utility/amd_inline_asm.hpp | 2 + .../include/utility/common_header.hpp | 6 +- composable_kernel/include/utility/config.hpp | 27 +-- .../include/utility/inner_product.hpp | 207 ++++++++++++++++++ ...mplicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp | 2 +- .../src/conv_fwd_driver_offline.cpp | 8 +- 8 files changed, 234 insertions(+), 215 deletions(-) delete mode 100644 composable_kernel/include/utility/amd_dlop.hpp create mode 100644 composable_kernel/include/utility/inner_product.hpp diff --git a/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp b/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp index 7e7bb9c8c3..ca3aca3015 100644 --- a/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp @@ -97,10 +97,9 @@ struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1 CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset( c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1)); - amd_inner_product_dlop( - a_buf[Number{}], - b_buf[Number{}], - c_buf(Number{})); + inner_product(a_buf[Number{}], + b_buf[Number{}], + c_buf(Number{})); }); }); }); @@ -214,7 +213,7 @@ struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_ CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset( c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1)); - amd_inner_product_dlop( + inner_product( a_vec.template AsType()[I0], b_vec.template AsType()[I0], c_buf(Number{})); diff --git a/composable_kernel/include/utility/amd_dlop.hpp b/composable_kernel/include/utility/amd_dlop.hpp deleted file mode 100644 index 8ce19012e9..0000000000 --- a/composable_kernel/include/utility/amd_dlop.hpp +++ /dev/null @@ -1,188 +0,0 @@ -#ifndef CK_AMD_DLOP_HPP -#define CK_AMD_DLOP_HPP - -#include "data_type.hpp" - -namespace ck { - -template -__device__ void amd_inner_product_dlop(const TA& a, const TB& b, TC& c); - -template <> -__device__ void -amd_inner_product_dlop(const float& a, const float& b, float& c) -{ -#if CK_USE_AMD_DLOP_INLINE_ASM - asm volatile("\n \ - v_fmac_f32 %0, %1, %2 \n \ - " - : "=v"(c) - : "v"(a), "v"(b), "0"(c)); -#else - c += a * b; -#endif -} - -template <> -__device__ void -amd_inner_product_dlop(const float2_t& a, const float2_t& b, float& c) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - amd_inner_product_dlop(vector_type{a}.AsType()[I0], - vector_type{b}.AsType()[I0], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I1], - vector_type{b}.AsType()[I1], - c); -} - -template <> -__device__ void -amd_inner_product_dlop(const float4_t& a, const float4_t& b, float& c) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - amd_inner_product_dlop(vector_type{a}.AsType()[I0], - vector_type{b}.AsType()[I0], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I1], - vector_type{b}.AsType()[I1], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I2], - vector_type{b}.AsType()[I2], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I3], - vector_type{b}.AsType()[I3], - c); -} - -#if CK_USE_AMD_DLOP -template <> -__device__ void -amd_inner_product_dlop(const half2_t& a, const half2_t& b, float& c) -{ -#if CK_USE_AMD_DLOP_INLINE_ASM - asm volatile("\n \ - v_dot2_f32_f16 %0, %1, %2, %0\n \ - " - : "=v"(c) - : "v"(a), "v"(b), "0"(c)); -#else - c = __builtin_amdgcn_sdot2(a, b, c, false); -#endif -} - -template <> -__device__ void -amd_inner_product_dlop(const half4_t& a, const half4_t& b, float& c) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - amd_inner_product_dlop(vector_type{a}.AsType()[I0], - vector_type{b}.AsType()[I0], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I1], - vector_type{b}.AsType()[I1], - c); -} - -template <> -__device__ void -amd_inner_product_dlop(const half8_t& a, const half8_t& b, float& c) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - amd_inner_product_dlop(vector_type{a}.AsType()[I0], - vector_type{b}.AsType()[I0], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I1], - vector_type{b}.AsType()[I1], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I2], - vector_type{b}.AsType()[I2], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I3], - vector_type{b}.AsType()[I3], - c); -} - -template <> -__device__ void amd_inner_product_dlop(const int8x4_t& a, - const int8x4_t& b, - int32_t& c) -{ -#if CK_USE_AMD_DLOP_INLINE_ASM - asm volatile("\n \ - v_dot4_i32_i8 %0, %1, %2, %0\n \ - " - : "=v"(c) - : "v"(as_type(a)), "v"(as_type(b)), "0"(c)); -#else - c = __builtin_amdgcn_sdot4(as_type(a), as_type(b), c, false); -#endif -} - -template <> -__device__ void amd_inner_product_dlop(const int8x8_t& a, - const int8x8_t& b, - int32_t& c) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - amd_inner_product_dlop(vector_type{a}.AsType()[I0], - vector_type{b}.AsType()[I0], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I1], - vector_type{b}.AsType()[I1], - c); -} - -template <> -__device__ void amd_inner_product_dlop(const int8x16_t& a, - const int8x16_t& b, - int32_t& c) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - amd_inner_product_dlop(vector_type{a}.AsType()[I0], - vector_type{b}.AsType()[I0], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I1], - vector_type{b}.AsType()[I1], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I2], - vector_type{b}.AsType()[I2], - c); - - amd_inner_product_dlop(vector_type{a}.AsType()[I3], - vector_type{b}.AsType()[I3], - c); -} -#endif // CK_USE_AMD_DLOP - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index 34c68a35aa..a2d9d5f062 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -4,6 +4,8 @@ #include "data_type.hpp" #include "c_style_pointer_cast.hpp" +// TODO: deprecate all amd_assembly_outer_product_xxx + namespace ck { // c0 += inner_product(a, b0) diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 45d22cd618..ba20248028 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -31,15 +31,13 @@ #include "static_buffer.hpp" #include "dynamic_buffer.hpp" +#include "inner_product.hpp" + // TODO: remove this #if CK_USE_AMD_INLINE_ASM #include "amd_inline_asm.hpp" #endif -#if CK_USE_AMD_DLOP -#include "amd_dlop.hpp" -#endif - #if CK_USE_AMD_XDLOPS #include "amd_xdlops.hpp" #endif diff --git a/composable_kernel/include/utility/config.hpp b/composable_kernel/include/utility/config.hpp index 49f1bb7a5a..521ad24d47 100644 --- a/composable_kernel/include/utility/config.hpp +++ b/composable_kernel/include/utility/config.hpp @@ -14,12 +14,7 @@ // should enable one and only one GPU target #if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \ defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030)) -#error Need to define a single GPU target -#endif - -// HIP version -#ifndef CK_HIP_VERSION_FLAT -#define CK_HIP_VERSION_FLAT 0 +#error Need to define (only) one GPU target #endif // launch bounds @@ -38,6 +33,16 @@ #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 #endif +// FMA instruction +#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) +#define CK_USE_AMD_V_MAC_F32 +#elif defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90a) || \ + defined(CK_AMD_GPU_GFX1030) +#define CK_USE_AMD_V_FMAC_F32 +#define CK_USE_AMD_V_DOT2_F32_F16 +#define CK_USE_AMD_V_DOT4_I32_I8 +#endif + // multi index #define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0 @@ -46,13 +51,9 @@ #define CK_USE_AMD_INLINE_ASM 1 #endif -// AMD DLOPS -#ifndef CK_USE_AMD_DLOP -#define CK_USE_AMD_DLOP 1 -#endif - -#ifndef CK_USE_AMD_DLOP_INLINE_ASM -#define CK_USE_AMD_DLOP_INLINE_ASM 1 +// AMD inner product (DLOP) +#ifndef CK_USE_AMD_INNER_PRODUCT_INLINE_ASM +#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1 #endif // AMD buffer addressing diff --git a/composable_kernel/include/utility/inner_product.hpp b/composable_kernel/include/utility/inner_product.hpp new file mode 100644 index 0000000000..51753accf3 --- /dev/null +++ b/composable_kernel/include/utility/inner_product.hpp @@ -0,0 +1,207 @@ +#ifndef CK_INNER_PRODUCT_HPP +#define CK_INNER_PRODUCT_HPP + +#include "data_type.hpp" + +namespace ck { + +template +__device__ void inner_product(const TA& a, const TB& b, TC& c); + +template <> +__device__ void inner_product(const float& a, const float& b, float& c) +{ +#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32) + asm volatile("\n \ + v_mac_f32 %0, %1, %2 \n \ + " + : "=v"(c) + : "v"(a), "v"(b), "0"(c)); +#elif CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32) + asm volatile("\n \ + v_fmac_f32 %0, %1, %2 \n \ + " + : "=v"(c) + : "v"(a), "v"(b), "0"(c)); +#else + c += a * b; +#endif +} + +template <> +__device__ void +inner_product(const float2_t& a, const float2_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); +} + +template <> +__device__ void +inner_product(const float4_t& a, const float4_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); + + inner_product(vector_type{a}.AsType()[I2], + vector_type{b}.AsType()[I2], + c); + + inner_product(vector_type{a}.AsType()[I3], + vector_type{b}.AsType()[I3], + c); +} + +template <> +__device__ void inner_product(const half2_t& a, const half2_t& b, float& c) +{ +#if defined(CK_USE_AMD_V_DOT2_F32_F16) +#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM + asm volatile("\n \ + v_dot2_f32_f16 %0, %1, %2, %0\n \ + " + : "=v"(c) + : "v"(a), "v"(b), "0"(c)); +#else + c = __builtin_amdgcn_sdot2(a, b, c, false); +#endif +#else + const auto convert = type_convert{}; + + const vector_type a_vector{a}; + const vector_type b_vector{b}; + + static_for<0, 2, 1>{}([&](auto i) { + c += convert(a_vector.AsType()[i]) * convert(b_vector.AsType()[i]); + }); +#endif +} + +template <> +__device__ void inner_product(const half4_t& a, const half4_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); +} + +template <> +__device__ void inner_product(const half8_t& a, const half8_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); + + inner_product(vector_type{a}.AsType()[I2], + vector_type{b}.AsType()[I2], + c); + + inner_product(vector_type{a}.AsType()[I3], + vector_type{b}.AsType()[I3], + c); +} + +template <> +__device__ void +inner_product(const int8x4_t& a, const int8x4_t& b, int32_t& c) +{ +#if defined(CK_USE_DOT4_I32_I8) +#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM + asm volatile("\n \ + v_dot4_i32_i8 %0, %1, %2, %0\n \ + " + : "=v"(c) + : "v"(as_type(a)), "v"(as_type(b)), "0"(c)); +#else + c = __builtin_amdgcn_sdot4(as_type(a), as_type(b), c, false); +#endif +#else + const auto convert = type_convert{}; + + const vector_type a_vector{a}; + const vector_type b_vector{b}; + + static_for<0, 4, 1>{}([&](auto i) { + c += convert(a_vector.AsType()[i]) * convert(b_vector.AsType()[i]); + }); +#endif +} + +template <> +__device__ void +inner_product(const int8x8_t& a, const int8x8_t& b, int32_t& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); +} + +template <> +__device__ void +inner_product(const int8x16_t& a, const int8x16_t& b, int32_t& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); + + inner_product(vector_type{a}.AsType()[I2], + vector_type{b}.AsType()[I2], + c); + + inner_product(vector_type{a}.AsType()[I3], + vector_type{b}.AsType()[I3], + c); +} + +} // namespace ck +#endif diff --git a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp index 0d28616386..e1b7c5486c 100644 --- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp @@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( const auto wei_desc_k_c_y_x = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths); const auto out_desc_n_k_ho_wo = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths); -#if 0 +#if 1 // [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32 // cdata = 64, BlockSize = 256 constexpr index_t BlockSize = 256; diff --git a/host/driver_offline/src/conv_fwd_driver_offline.cpp b/host/driver_offline/src/conv_fwd_driver_offline.cpp index 2653929c32..161d17a4de 100644 --- a/host/driver_offline/src/conv_fwd_driver_offline.cpp +++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp @@ -20,9 +20,9 @@ #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp" #define USE_MODE 1 -#define USE_CONV_FWD_V4R4_NCHW 0 -#define USE_CONV_FWD_V4R4R2_NHWC 1 -#define USE_CONV_FWD_V6R1_NCHW 1 +#define USE_CONV_FWD_V4R4_NCHW 1 +#define USE_CONV_FWD_V4R4R2_NHWC 0 +#define USE_CONV_FWD_V6R1_NCHW 0 #define USE_CONV_FWD_V5R1_NCHW 0 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 0 @@ -126,7 +126,7 @@ int main(int argc, char* argv[]) const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; #endif -#if 0 +#if 1 using in_data_t = float; using acc_data_t = float; using out_data_t = float;