adding int8 direct that reads pre-vectorized data

[ROCm/composable_kernel commit: 050a1a6890]
2026-05-16 19:09:59 +00:00 · 2019-03-19 00:05:41 -05:00
parent 14ad082d6c
commit 80f2ab2e09
6 changed files with 247 additions and 95 deletions
--- a/src/include/data_type.hip.hpp
+++ b/src/include/data_type.hip.hpp
@@ -10,16 +10,6 @@ namespace CUDA {
 using half  = CUDA::half;
 using half2 = CUDA::half2;

-struct half4
-{
-    half data[4];
-};
-
-struct half8
-{
-    half data[8];
-};
-
 template <class T, unsigned N>
 struct vector_type
 {
@@ -119,39 +109,141 @@ struct vector_type<half2, 4>
    using MemoryType = float4;
 };

-template <class TDst, class TSrc0, class TSrc1, class TSrc2>
-__device__ void fused_multiply_add(TDst& d, TSrc0 s0, TSrc1 s1, TSrc2 s2)
+template <>
+struct vector_type<char, 1>
 {
+    using MemoryType = char;
+
+    __host__ __device__ static MemoryType Pack(char s) { return s; }
+};
+
+template <>
+struct vector_type<char, 2>
+{
+    using MemoryType = char2;
+
+    __host__ __device__ static MemoryType Pack(char s0, char s1)
+    {
+        union
+        {
+            MemoryType vector;
+            char scalar[2];
+        } data;
+
+        data.scalar[0] = s0;
+        data.scalar[1] = s1;
+        return data.vector;
+    }
+};
+
+template <>
+struct vector_type<char, 4>
+{
+    using MemoryType = char4;
+
+    __host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3)
+    {
+        union
+        {
+            MemoryType vector;
+            char scalar[4];
+        } data;
+
+        data.scalar[0] = s0;
+        data.scalar[1] = s1;
+        data.scalar[2] = s2;
+        data.scalar[3] = s3;
+        return data.vector;
+    }
+};
+
+template <>
+struct vector_type<char, 8>
+{
+    using MemoryType = int64_t;
+};
+
+template <>
+struct vector_type<char2, 2>
+{
+    using MemoryType = char4;
+};
+
+template <>
+struct vector_type<char2, 4>
+{
+    using MemoryType = int64_t;
+};
+
+template <>
+struct vector_type<char4, 2>
+{
+    using MemoryType = int64_t;
+};
+
+template <class TDst, class TSrc0, class TSrc1>
+__device__ void fused_multiply_accumulate(TDst& d, const TSrc0& s0, const TSrc1& s1)
+{
+    // static_assert(false, "should not call into base");
    printf("should not call into base");
    assert(false);
 }

 template <>
-__device__ void fused_multiply_add(float& d, float s0, float s1, float s2)
+__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 {
-    d = s0 * s1 + s2;
+    d += s0 * s1;
 }

 template <>
-__device__ void fused_multiply_add(float& d, float2 s0, float2 s1, float s2)
+__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1)
 {
-    d = s0.x * s1.x + s0.y * s1.y + s2;
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
 }

 template <>
-__device__ void fused_multiply_add(float& d, float4 s0, float4 s1, float s2)
+__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
 {
-    d = s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w + s2;
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
+    d += s0.z * s1.z;
+    d += s0.w * s1.w;
 }

 template <>
-__device__ void fused_multiply_add(half& d, half s0, half s1, half s2)
+__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1)
 {
-    d = s0 * s1 + s2;
+    d += s0 * s1;
 }

 template <>
-__device__ void fused_multiply_add(half& d, half2 s0, half2 s1, half s2)
+__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
 {
-    d = s0.x * s1.x + s0.y * s1.y + s2;
-}
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
+}
+
+#if 0
+template <>
+__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x + s0.y * s1.y;
+}
+#endif
+
+template <>
+__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1)
+{
+    d += s0 * s1;
+}
+
+template <>
+__device__ void fused_multiply_accumulate(int32_t& d, const char4& s0, const char4& s1)
+{
+#if DEVICE_BACKEND_CUDA
+    d = __dp4a(s0, s1, d);
+#else
+    d += s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w;
+#endif
+}
--- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
@@ -7,7 +7,9 @@
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "threadwise_direct_convolution.hip.hpp"

-template <class Float,
+template <class TInWei,
+          class TOut,
+          class TAccum,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
@@ -27,14 +29,16 @@ template <class Float,
          unsigned BlockSize,
          unsigned GridSize>
 __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
-    const typename vector_type<Float,
+    const typename vector_type<TInWei,
                               ScalarPerVector>::MemoryType* const __restrict__ p_in_vec_global,
-    const typename vector_type<Float,
+    const typename vector_type<TInWei,
                               ScalarPerVector>::MemoryType* const __restrict__ p_wei_vec_global,
-    Float* const __restrict__ p_out_global)
+    TOut* const __restrict__ p_out_global)
 {
-    using scalar_t     = Float;
-    using vector_mem_t = typename vector_type<scalar_t, ScalarPerVector>::MemoryType;
+    using in_scalar_t     = TInWei;
+    using in_vector_mem_t = typename vector_type<in_scalar_t, ScalarPerVector>::MemoryType;
+    using out_scalar_t    = TOut;
+    using accum_t         = TAccum;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
@@ -79,9 +83,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
                                       ? InBlockCopyDataPerRead
                                       : WeiBlockCopyDataPerRead;

-    __shared__ vector_mem_t
+    __shared__ in_vector_mem_t
        p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)];
-    __shared__ vector_mem_t
+    __shared__ in_vector_mem_t
        p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)];

    // threadwise tensors
@@ -99,7 +103,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
        in_nchw_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc);

    // register
-    scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
+    out_scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()];

    // divide block work
    constexpr unsigned NBlockWork =
@@ -155,7 +159,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(

    constexpr auto blockwise_in_copy =
        Blockwise4dTensorCopy1<BlockSize,
-                               vector_mem_t,
+                               in_vector_mem_t,
                               decltype(in_nchw_vec_global_desc),
                               decltype(in_nchw_vec_block_desc),
                               decltype(in_nchw_vec_block_desc.GetLengths()),
@@ -164,7 +168,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
 #if 0
    constexpr auto blockwise_wei_copy =
        Blockwise4dTensorCopy1<BlockSize,
-                               vector_mem_t,
+                               in_vector_mem_t,
                               decltype(wei_kcyx_vec_global_desc),
                               decltype(wei_kcyx_vec_block_desc),
                               decltype(wei_kcyx_vec_block_desc.GetLengths()),
@@ -172,15 +176,17 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
 #elif 1
    const auto blockwise_wei_copy =
        Blockwise2dTensorCopy3<BlockSize,
-                               vector_mem_t,
+                               in_vector_mem_t,
                               decltype(wei_ke_vec_global_desc),
                               decltype(wei_ke_vec_block_desc),
                               decltype(wei_ke_vec_block_desc.GetLengths()),
                               WeiBlockCopyDataPerRead>{};
 #endif

+#if 1 // debug
    // set threadwise output tensor to 0
    threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
+#endif

    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
        c_block_data_begin += CPerBlock, __syncthreads())
--- a/src/include/threadwise_4d_tensor_op.hip.hpp
+++ b/src/include/threadwise_4d_tensor_op.hip.hpp
@@ -37,7 +37,8 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re

 // TODO: in order to optimize mem access for different mem type,
 // need to write specialized version
-template <class Float,
+template <class SrcData,
+          class DstData,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
@@ -45,9 +46,9 @@ template <class Float,
          class F>
 __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
    SrcDesc,
-    const Float* __restrict__ p_src,
+    const SrcData* __restrict__ p_src,
    DstDesc,
-    Float* __restrict__ p_dst,
+    DstData* __restrict__ p_dst,
    SrcOpLengths,
    DstFromSrcReorder,
    F f)
@@ -88,33 +89,38 @@ __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_d
    }
 }

-template <class Float, class Desc>
-__device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p)
+template <class Data, class Desc>
+__device__ void threadwise_4d_tensor_set_zero(Desc, Data* __restrict__ p)
 {
-    auto f_set_zero = [](Float& v) { v = Float(0); };
+    auto f_set_zero = [](Data& v) { v = Data(0); };

-    threadwise_4d_tensor_pointwise_operation_unary<Float, Desc, decltype(f_set_zero)>(
+    threadwise_4d_tensor_pointwise_operation_unary<Data, Desc, decltype(f_set_zero)>(
        Desc{}, p, f_set_zero);
 }

-template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, class DstFromSrcReorder>
+template <class SrcData,
+          class DstData,
+          class SrcDesc,
+          class DstDesc,
+          class SrcOpLengths,
+          class DstFromSrcReorder>
 __device__ void
 threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
-                                                      const Float* __restrict__ p_src,
+                                                      const SrcData* __restrict__ p_src,
                                                      DstDesc,
-                                                      Float* __restrict__ p_dst,
+                                                      DstData* __restrict__ p_dst,
                                                      SrcOpLengths,
                                                      DstFromSrcReorder)
 {
-    auto f_copy = [](const Float& src, Float& dst) { dst = src; };
+    auto f_copy = [](const SrcData& src, DstData& dst) { dst = static_cast<DstData>(src); };

    threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
 }

-template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
+template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths>
 __device__ void threadwise_4d_tensor_copy(
-    SrcDesc, const Float* __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
+    SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths)
 {
    auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};

--- a/src/include/threadwise_direct_convolution.hip.hpp
+++ b/src/include/threadwise_direct_convolution.hip.hpp
@@ -51,10 +51,8 @@ __device__ void threadwise_direct_convolution_1(InDesc,

                                const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo);

-                                fused_multiply_add(p_out[out_index],
-                                                   p_wei[wei_index],
-                                                   p_in[in_index],
-                                                   p_out[out_index]);
+                                fused_multiply_accumulate(
+                                    p_out[out_index], p_wei[wei_index], p_in[in_index]);
                            }
                        }
                    }