make innner product compatiable on gfx900

2026-05-12 17:26:00 +00:00 · 2021-08-11 09:42:53 -05:00
parent 4e57b30a6a
commit f2ac7832c6
8 changed files with 234 additions and 215 deletions
--- a/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp
@@ -97,8 +97,7 @@ struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1
                                CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
                                    c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));
-                            amd_inner_product_dlop<FloatA, FloatB, FloatC>(
+                            inner_product<FloatA, FloatB, FloatC>(a_buf[Number<a_offset>{}],
                                a_buf[Number<a_offset>{}],
                                                                  b_buf[Number<b_offset>{}],
                                                                  c_buf(Number<c_offset>{}));
                        });
@@ -214,7 +213,7 @@ struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_
                                CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
                                    c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));
-                            amd_inner_product_dlop<a_vector_t, b_vector_t, FloatC>(
+                            inner_product<a_vector_t, b_vector_t, FloatC>(
                                a_vec.template AsType<a_vector_t>()[I0],
                                b_vec.template AsType<b_vector_t>()[I0],
                                c_buf(Number<c_offset>{}));
--- a/composable_kernel/include/utility/amd_dlop.hpp
+++ b/composable_kernel/include/utility/amd_dlop.hpp
@@ -1,188 +0,0 @@
 #ifndef CK_AMD_DLOP_HPP
 #define CK_AMD_DLOP_HPP
 #include "data_type.hpp"
 namespace ck {
 template <typename TA, typename TB, typename TC>
 __device__ void amd_inner_product_dlop(const TA& a, const TB& b, TC& c);
 template <>
 __device__ void
 amd_inner_product_dlop<float, float, float>(const float& a, const float& b, float& c)
 {
 #if CK_USE_AMD_DLOP_INLINE_ASM
    asm volatile("\n \
            v_fmac_f32 %0, %1, %2 \n \
            "
                 : "=v"(c)
                 : "v"(a), "v"(b), "0"(c));
 #else
    c += a * b;
 #endif
 }
 template <>
 __device__ void
 amd_inner_product_dlop<float2_t, float2_t, float>(const float2_t& a, const float2_t& b, float& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    amd_inner_product_dlop(vector_type<float, 2>{a}.AsType<float>()[I0],
                           vector_type<float, 2>{b}.AsType<float>()[I0],
                           c);
    amd_inner_product_dlop(vector_type<float, 2>{a}.AsType<float>()[I1],
                           vector_type<float, 2>{b}.AsType<float>()[I1],
                           c);
 }
 template <>
 __device__ void
 amd_inner_product_dlop<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, float& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I0],
                           vector_type<float, 4>{b}.AsType<float>()[I0],
                           c);
    amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I1],
                           vector_type<float, 4>{b}.AsType<float>()[I1],
                           c);
    amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I2],
                           vector_type<float, 4>{b}.AsType<float>()[I2],
                           c);
    amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I3],
                           vector_type<float, 4>{b}.AsType<float>()[I3],
                           c);
 }
 #if CK_USE_AMD_DLOP
 template <>
 __device__ void
 amd_inner_product_dlop<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
 {
 #if CK_USE_AMD_DLOP_INLINE_ASM
    asm volatile("\n \
            v_dot2_f32_f16 %0, %1, %2, %0\n \
            "
                 : "=v"(c)
                 : "v"(a), "v"(b), "0"(c));
 #else
    c = __builtin_amdgcn_sdot2(a, b, c, false);
 #endif
 }
 template <>
 __device__ void
 amd_inner_product_dlop<half4_t, half4_t, float>(const half4_t& a, const half4_t& b, float& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    amd_inner_product_dlop(vector_type<half_t, 4>{a}.AsType<half2_t>()[I0],
                           vector_type<half_t, 4>{b}.AsType<half2_t>()[I0],
                           c);
    amd_inner_product_dlop(vector_type<half_t, 4>{a}.AsType<half2_t>()[I1],
                           vector_type<half_t, 4>{b}.AsType<half2_t>()[I1],
                           c);
 }
 template <>
 __device__ void
 amd_inner_product_dlop<half8_t, half8_t, float>(const half8_t& a, const half8_t& b, float& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I0],
                           vector_type<half_t, 8>{b}.AsType<half2_t>()[I0],
                           c);
    amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I1],
                           vector_type<half_t, 8>{b}.AsType<half2_t>()[I1],
                           c);
    amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I2],
                           vector_type<half_t, 8>{b}.AsType<half2_t>()[I2],
                           c);
    amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I3],
                           vector_type<half_t, 8>{b}.AsType<half2_t>()[I3],
                           c);
 }
 template <>
 __device__ void amd_inner_product_dlop<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a,
                                                                    const int8x4_t& b,
                                                                    int32_t& c)
 {
 #if CK_USE_AMD_DLOP_INLINE_ASM
    asm volatile("\n \
            v_dot4_i32_i8 %0, %1, %2, %0\n \
            "
                 : "=v"(c)
                 : "v"(as_type<int32_t>(a)), "v"(as_type<int32_t>(b)), "0"(c));
 #else
    c = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b), c, false);
 #endif
 }
 template <>
 __device__ void amd_inner_product_dlop<int8x8_t, int8x8_t, int32_t>(const int8x8_t& a,
                                                                    const int8x8_t& b,
                                                                    int32_t& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    amd_inner_product_dlop(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
                           vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
                           c);
    amd_inner_product_dlop(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
                           vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
                           c);
 }
 template <>
 __device__ void amd_inner_product_dlop<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a,
                                                                      const int8x16_t& b,
                                                                      int32_t& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
                           vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
                           c);
    amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
                           vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
                           c);
    amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
                           vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
                           c);
    amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
                           vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
                           c);
 }
 #endif // CK_USE_AMD_DLOP
 } // namespace ck
 #endif
--- a/composable_kernel/include/utility/amd_inline_asm.hpp
+++ b/composable_kernel/include/utility/amd_inline_asm.hpp
@@ -4,6 +4,8 @@
 #include "data_type.hpp"
 #include "c_style_pointer_cast.hpp"
 // TODO: deprecate all amd_assembly_outer_product_xxx
 namespace ck {
 // c0 += inner_product(a, b0)
--- a/composable_kernel/include/utility/common_header.hpp
+++ b/composable_kernel/include/utility/common_header.hpp
@@ -31,15 +31,13 @@
 #include "static_buffer.hpp"
 #include "dynamic_buffer.hpp"
 #include "inner_product.hpp"
 // TODO: remove this
 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
 #endif
 #if CK_USE_AMD_DLOP
 #include "amd_dlop.hpp"
 #endif
 #if CK_USE_AMD_XDLOPS
 #include "amd_xdlops.hpp"
 #endif
--- a/composable_kernel/include/utility/config.hpp
+++ b/composable_kernel/include/utility/config.hpp
@@ -14,12 +14,7 @@
 // should enable one and only one GPU target
 #if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
      defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030))
-#error Need to define a single GPU target
+#error Need to define (only) one GPU target
 #endif
 // HIP version
 #ifndef CK_HIP_VERSION_FLAT
 #define CK_HIP_VERSION_FLAT 0
 #endif
 // launch bounds
@@ -38,6 +33,16 @@
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
 #endif
 // FMA instruction
 #if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900)
 #define CK_USE_AMD_V_MAC_F32
 #elif defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90a) || \
    defined(CK_AMD_GPU_GFX1030)
 #define CK_USE_AMD_V_FMAC_F32
 #define CK_USE_AMD_V_DOT2_F32_F16
 #define CK_USE_AMD_V_DOT4_I32_I8
 #endif
 // multi index
 #define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
@@ -46,13 +51,9 @@
 #define CK_USE_AMD_INLINE_ASM 1
 #endif
-// AMD DLOPS
+// AMD inner product (DLOP)
-#ifndef CK_USE_AMD_DLOP
+#ifndef CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
-#define CK_USE_AMD_DLOP 1
+#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1
 #endif
 #ifndef CK_USE_AMD_DLOP_INLINE_ASM
 #define CK_USE_AMD_DLOP_INLINE_ASM 1
 #endif
 // AMD buffer addressing
--- a/composable_kernel/include/utility/inner_product.hpp
+++ b/composable_kernel/include/utility/inner_product.hpp
@@ -0,0 +1,207 @@
 #ifndef CK_INNER_PRODUCT_HPP
 #define CK_INNER_PRODUCT_HPP
 #include "data_type.hpp"
 namespace ck {
 template <typename TA, typename TB, typename TC>
 __device__ void inner_product(const TA& a, const TB& b, TC& c);
 template <>
 __device__ void inner_product<float, float, float>(const float& a, const float& b, float& c)
 {
 #if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
    asm volatile("\n \
            v_mac_f32 %0, %1, %2 \n \
            "
                 : "=v"(c)
                 : "v"(a), "v"(b), "0"(c));
 #elif CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
    asm volatile("\n \
            v_fmac_f32 %0, %1, %2 \n \
            "
                 : "=v"(c)
                 : "v"(a), "v"(b), "0"(c));
 #else
    c += a * b;
 #endif
 }
 template <>
 __device__ void
 inner_product<float2_t, float2_t, float>(const float2_t& a, const float2_t& b, float& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    inner_product(vector_type<float, 2>{a}.AsType<float>()[I0],
                  vector_type<float, 2>{b}.AsType<float>()[I0],
                  c);
    inner_product(vector_type<float, 2>{a}.AsType<float>()[I1],
                  vector_type<float, 2>{b}.AsType<float>()[I1],
                  c);
 }
 template <>
 __device__ void
 inner_product<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, float& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    inner_product(vector_type<float, 4>{a}.AsType<float>()[I0],
                  vector_type<float, 4>{b}.AsType<float>()[I0],
                  c);
    inner_product(vector_type<float, 4>{a}.AsType<float>()[I1],
                  vector_type<float, 4>{b}.AsType<float>()[I1],
                  c);
    inner_product(vector_type<float, 4>{a}.AsType<float>()[I2],
                  vector_type<float, 4>{b}.AsType<float>()[I2],
                  c);
    inner_product(vector_type<float, 4>{a}.AsType<float>()[I3],
                  vector_type<float, 4>{b}.AsType<float>()[I3],
                  c);
 }
 template <>
 __device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
 {
 #if defined(CK_USE_AMD_V_DOT2_F32_F16)
 #if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
    asm volatile("\n \
            v_dot2_f32_f16 %0, %1, %2, %0\n \
            "
                 : "=v"(c)
                 : "v"(a), "v"(b), "0"(c));
 #else
    c = __builtin_amdgcn_sdot2(a, b, c, false);
 #endif
 #else
    const auto convert = type_convert<int32_t>{};
    const vector_type<half_t, 2> a_vector{a};
    const vector_type<half_t, 2> b_vector{b};
    static_for<0, 2, 1>{}([&](auto i) {
        c += convert(a_vector.AsType<half_t>()[i]) * convert(b_vector.AsType<half_t>()[i]);
    });
 #endif
 }
 template <>
 __device__ void inner_product<half4_t, half4_t, float>(const half4_t& a, const half4_t& b, float& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    inner_product(vector_type<half_t, 4>{a}.AsType<half2_t>()[I0],
                  vector_type<half_t, 4>{b}.AsType<half2_t>()[I0],
                  c);
    inner_product(vector_type<half_t, 4>{a}.AsType<half2_t>()[I1],
                  vector_type<half_t, 4>{b}.AsType<half2_t>()[I1],
                  c);
 }
 template <>
 __device__ void inner_product<half8_t, half8_t, float>(const half8_t& a, const half8_t& b, float& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I0],
                  vector_type<half_t, 8>{b}.AsType<half2_t>()[I0],
                  c);
    inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I1],
                  vector_type<half_t, 8>{b}.AsType<half2_t>()[I1],
                  c);
    inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I2],
                  vector_type<half_t, 8>{b}.AsType<half2_t>()[I2],
                  c);
    inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I3],
                  vector_type<half_t, 8>{b}.AsType<half2_t>()[I3],
                  c);
 }
 template <>
 __device__ void
 inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
 {
 #if defined(CK_USE_DOT4_I32_I8)
 #if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
    asm volatile("\n \
            v_dot4_i32_i8 %0, %1, %2, %0\n \
            "
                 : "=v"(c)
                 : "v"(as_type<int32_t>(a)), "v"(as_type<int32_t>(b)), "0"(c));
 #else
    c = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b), c, false);
 #endif
 #else
    const auto convert = type_convert<int32_t>{};
    const vector_type<int8_t, 4> a_vector{a};
    const vector_type<int8_t, 4> b_vector{b};
    static_for<0, 4, 1>{}([&](auto i) {
        c += convert(a_vector.AsType<int8_t>()[i]) * convert(b_vector.AsType<int8_t>()[i]);
    });
 #endif
 }
 template <>
 __device__ void
 inner_product<int8x8_t, int8x8_t, int32_t>(const int8x8_t& a, const int8x8_t& b, int32_t& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
                  vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
                  c);
    inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
                  vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
                  c);
 }
 template <>
 __device__ void
 inner_product<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a, const int8x16_t& b, int32_t& c)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
                  vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
                  c);
    inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
                  vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
                  c);
    inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
                  vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
                  c);
    inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
                  vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
                  c);
 }
 } // namespace ck
 #endif
--- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
@@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
    const auto wei_desc_k_c_y_x   = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
    const auto out_desc_n_k_ho_wo = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
-#if 0
+#if 1
    // [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32
    // cdata = 64, BlockSize = 256
    constexpr index_t BlockSize = 256;
--- a/host/driver_offline/src/conv_fwd_driver_offline.cpp
+++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp
@@ -20,9 +20,9 @@
 #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
 #define USE_MODE 1
-#define USE_CONV_FWD_V4R4_NCHW 0
+#define USE_CONV_FWD_V4R4_NCHW 1
-#define USE_CONV_FWD_V4R4R2_NHWC 1
+#define USE_CONV_FWD_V4R4R2_NHWC 0
-#define USE_CONV_FWD_V6R1_NCHW 1
+#define USE_CONV_FWD_V6R1_NCHW 0
 #define USE_CONV_FWD_V5R1_NCHW 0
 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
@@ -126,7 +126,7 @@ int main(int argc, char* argv[])
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
 #endif
-#if 0
+#if 1
    using in_data_t  = float;
    using acc_data_t = float;
    using out_data_t = float;