Mirror of https://github.com/ROCm/composable_kernel.git, synced 2026-05-14 10:09:41 +00:00
@@ -4,7 +4,7 @@
 #include "threadwise_direct_convolution.cuh"

 template <unsigned BlockSize,
-          class TFloat,
+          class Float,
           class InBlockDesc,
           class WeiBlockDesc,
           class OutBlockDesc,
@@ -14,11 +14,11 @@ template <unsigned BlockSize,
           unsigned KPerThread,
           unsigned CPerThread>
 __device__ void blockwise_direct_convolution(InBlockDesc,
-                                             TFloat* const __restrict__ p_in_block,
+                                             Float* const __restrict__ p_in_block,
                                              WeiBlockDesc,
-                                             TFloat* const __restrict__ p_wei_block,
+                                             Float* const __restrict__ p_wei_block,
                                              OutBlockDesc,
-                                             TFloat* __restrict__ p_out_block)
+                                             Float* __restrict__ p_out_block)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -92,7 +92,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
     unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding
     unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding

-    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
+    Float p_out_thread[out_thread_desc.GetElementSpace()];

     threadwise_4d_tensor_copy(out_block_desc,
                               p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
@@ -1,9 +1,9 @@
 #pragma once
 #include "constant_tensor_descriptor.cuh"

-template <unsigned BlockSize, class TFloat, class DstDesc, class F>
+template <unsigned BlockSize, class Float, class DstDesc, class F>
 __device__ void
-blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
+blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -79,7 +79,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
 // TODO: in order to optimize mem access for different mem type,
 // need to write specialized version
 template <unsigned BlockSize,
-          class TFloat,
+          class Float,
           class SrcDesc,
           class DstDesc,
           class RefDesc,
@@ -87,9 +87,9 @@ template <unsigned BlockSize,
           class F>
 __device__ void
 blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
-                                                       TFloat* const __restrict__ p_src,
+                                                       Float* const __restrict__ p_src,
                                                        DstDesc,
-                                                       TFloat* __restrict__ p_dst,
+                                                       Float* __restrict__ p_dst,
                                                        RefDesc,
                                                        Reorder,
                                                        F f)
@@ -170,36 +170,32 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
     }
 }

-template <unsigned BlockSize, class TFloat, class DstDesc>
-__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst)
+template <unsigned BlockSize, class Float, class DstDesc>
+__device__ void blockwise_4d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
 {
-    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
+    auto f_set_zero = [](Float& v) { v = Float(0); };

     blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
 }

 template <unsigned BlockSize,
-          class TFloat,
+          class Float,
           class SrcDesc,
           class DstDesc,
           class RefDesc,
           class Reorder>
-__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
-                                                 TFloat* const __restrict__ p_src,
-                                                 DstDesc,
-                                                 TFloat* __restrict__ p_dst,
-                                                 RefDesc,
-                                                 Reorder)
+__device__ void blockwise_4d_tensor_copy_reorder(
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
 {
-    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
+    auto f_copy = [](const Float& src, Float& dst) { dst = src; };

     blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>(
         SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
 }

-template <unsigned BlockSize, class TFloat, class SrcDesc, class DstDesc, class RefDesc>
+template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class RefDesc>
 __device__ void blockwise_4d_tensor_copy(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
 {
     constexpr auto reorder = Sequence<0, 1, 2, 3>{};

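The blockwise pointwise routines above distribute one flat element range over the threads of a block and apply a functor elementwise; blockwise_4d_tensor_set_zero is the unary case with a zeroing lambda. Below is a minimal self-contained sketch of that pattern, using a plain 1-D buffer instead of the repo's ConstantTensorDescriptor; blockwise_pointwise_unary_1d and set_zero_kernel are made-up names for illustration, not the repo's API.

#include <cuda_runtime.h>

// Hypothetical sketch: each of the BlockSize threads strides through the
// buffer with step BlockSize and applies the unary functor in place.
template <unsigned BlockSize, class Float, class F>
__device__ void blockwise_pointwise_unary_1d(Float* __restrict__ p, unsigned n, F f)
{
    for(unsigned i = threadIdx.x; i < n; i += BlockSize)
        f(p[i]);
}

template <unsigned BlockSize, class Float>
__global__ void set_zero_kernel(Float* p, unsigned n)
{
    // same shape as the repo's f_set_zero lambda
    auto f_set_zero = [](Float& v) { v = Float(0); };
    blockwise_pointwise_unary_1d<BlockSize>(p, n, f_set_zero);
}

int main()
{
    float* p = nullptr;
    cudaMalloc(&p, 256 * sizeof(float));
    set_zero_kernel<64, float><<<1, 64>>>(p, 256);
    cudaDeviceSynchronize();
    cudaFree(p);
    return 0;
}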
@@ -3,7 +3,7 @@
 #include "blockwise_tensor_op.cuh"
 #include "blockwise_direct_convolution.cuh"

-template <class TFloat,
+template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
@@ -20,11 +20,11 @@ template <class TFloat,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_1(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
+                                              Float* const __restrict__ p_in_global,
                                               WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
+                                              Float* const __restrict__ p_wei_global,
                                               OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
+                                              Float* __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -68,9 +68,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
     constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
     constexpr unsigned out_block_size = out_block_desc.GetElementSpace();

-    __shared__ TFloat p_in_block[in_block_size];
-    __shared__ TFloat p_wei_block[wei_block_size];
-    __shared__ TFloat p_out_block[out_block_size];
+    __shared__ Float p_in_block[in_block_size];
+    __shared__ Float p_wei_block[wei_block_size];
+    __shared__ Float p_out_block[out_block_size];

     const unsigned block_id = blockIdx.x;

@@ -150,7 +150,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,

     // blockwise convolution
     blockwise_direct_convolution<BlockSize,
-                                 TFloat,
+                                 Float,
                                  decltype(in_block_desc),
                                  decltype(wei_block_desc),
                                  decltype(out_block_desc),
@@ -5,7 +5,7 @@
 #include "threadwise_tensor_op.cuh"
 #include "threadwise_direct_convolution.cuh"

-template <class TFloat,
+template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
@@ -22,11 +22,11 @@ template <class TFloat,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_2(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
+                                              Float* const __restrict__ p_in_global,
                                               WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
+                                              Float* const __restrict__ p_wei_global,
                                               OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
+                                              Float* __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -56,8 +56,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     constexpr unsigned in_block_size = in_block_desc.GetElementSpace();
     constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();

-    __shared__ TFloat p_in_block[in_block_size];
-    __shared__ TFloat p_wei_block[wei_block_size];
+    __shared__ Float p_in_block[in_block_size];
+    __shared__ Float p_wei_block[wei_block_size];

     // threadwise tensors
     constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
@@ -73,7 +73,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);

     // register
-    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
+    Float p_out_thread[out_thread_desc.GetElementSpace()];

     // divide block work
     constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
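Both gridwise kernels above follow the same staging discipline: tiles of input and weights are copied from global memory into __shared__ (LDS) buffers sized from the block descriptors, threads synchronize, and each thread then accumulates its output in registers. A minimal self-contained sketch of that discipline follows, with a dot product standing in for the blockwise convolution; staged_multiply_accumulate and TileSize are made-up names, not the repo's kernels.

#include <cuda_runtime.h>

template <unsigned BlockSize, unsigned TileSize, class Float>
__global__ void staged_multiply_accumulate(Float const* __restrict__ p_in_global,
                                           Float const* __restrict__ p_wei_global,
                                           Float* __restrict__ p_out_global)
{
    // stage tiles in LDS, as the gridwise kernels do with p_in_block/p_wei_block
    __shared__ Float p_in_block[TileSize];
    __shared__ Float p_wei_block[TileSize];

    // cooperative copy: global -> LDS, one strided pass over the tile
    for(unsigned i = threadIdx.x; i < TileSize; i += BlockSize)
    {
        p_in_block[i]  = p_in_global[blockIdx.x * TileSize + i];
        p_wei_block[i] = p_wei_global[blockIdx.x * TileSize + i];
    }

    __syncthreads();

    // per-thread work reads LDS and accumulates in a register
    Float acc = Float(0);
    for(unsigned i = 0; i < TileSize; ++i)
        acc += p_in_block[i] * p_wei_block[i];

    if(threadIdx.x == 0)
        p_out_global[blockIdx.x] = acc;
}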
@@ -3,7 +3,7 @@
 #include "blockwise_winograd_transform.cuh"
 #include "threadwise_winograd_transform.cuh"

-template <class TFloat,
+template <class Float,
           class InGlobalDesc,
           class WeiGlobalDesc,
           class OutGlobalDesc,
@@ -20,11 +20,11 @@ template <class TFloat,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_winograd_convolution(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
+                                              Float* const __restrict__ p_in_global,
                                               WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
+                                              Float* const __restrict__ p_wei_global,
                                               OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
+                                              Float* __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -102,8 +102,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
     constexpr auto wei_transform_block_desc =
         make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, InTileSizeH, InTileSizeW>{});

-    __shared__ TFloat p_in_transform_block[in_transform_block_desc.GetElementSpace()];
-    __shared__ TFloat p_wei_transform_block[wei_transform_block_desc.GetElementSpace()];
+    __shared__ Float p_in_transform_block[in_transform_block_desc.GetElementSpace()];
+    __shared__ Float p_wei_transform_block[wei_transform_block_desc.GetElementSpace()];

     // thread data
     constexpr auto in_transform_thread_block_desc =
@@ -123,8 +123,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
     constexpr auto out_thread_global_desc =
         make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_global_desc.GetStrides());

-    TFloat p_out_transform_thread[out_transform_thread_desc.GetElementSpace()];
-    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
+    Float p_out_transform_thread[out_transform_thread_desc.GetElementSpace()];
+    Float p_out_thread[out_thread_desc.GetElementSpace()];

 #if 0
     if(blockIdx.x == 0 && threadIdx.x == 0)
@@ -146,7 +146,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
     {
 #if 0
         // blockwise transform input
-        blockwise_winograd_transform_input<TFloat,
+        blockwise_winograd_transform_input<Float,
                                            InTileSizeH,
                                            InTileSizeW,
                                            S,
@@ -166,7 +166,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,

 #endif
         // blockwise transform weights
-        blockwise_winograd_transform_weight<TFloat,
+        blockwise_winograd_transform_weight<Float,
                                             InTileSizeH,
                                             InTileSizeW,
                                             S,
@@ -183,7 +183,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
         {
             // threadwise point multiplication
             threadwise_winograd_calculate_transformed_output<
-                TFloat,
+                Float,
                 decltype(in_transform_thread_block_desc),
                 decltype(wei_transform_thread_block_desc),
                 decltype(out_transform_thread_desc),
@@ -207,7 +207,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc,
         };

         // transform back
-        threadwise_winograd_reverse_transform_output<TFloat,
+        threadwise_winograd_reverse_transform_output<Float,
                                                      decltype(out_transform_thread_desc),
                                                      decltype(out_thread_desc),
                                                      InTileSizeH,
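For context on the Winograd kernel above: once input tiles and weights have been transformed, the remaining per-tile work is an elementwise (Hadamard) product in the transform domain, accumulated over input channels, followed by the reverse transform back to the output tile. A stand-alone sketch of just the point-multiplication step, assuming a row-major [C][TileH][TileW] layout instead of the repo's descriptor-driven layout; pointwise_multiply_transformed is a hypothetical name.

// Sum over channels of the elementwise product of the transformed
// input tile and the transformed weight tile.
template <class Float, unsigned C, unsigned TileH, unsigned TileW>
__device__ void
pointwise_multiply_transformed(Float const* __restrict__ p_in_transform,  // [C][TileH][TileW]
                               Float const* __restrict__ p_wei_transform, // [C][TileH][TileW]
                               Float* __restrict__ p_out_transform)       // [TileH][TileW]
{
    for(unsigned h = 0; h < TileH; ++h)
        for(unsigned w = 0; w < TileW; ++w)
        {
            Float acc = Float(0);

            for(unsigned c = 0; c < C; ++c)
                acc += p_in_transform[(c * TileH + h) * TileW + w] *
                       p_wei_transform[(c * TileH + h) * TileW + w];

            p_out_transform[h * TileW + w] = acc;
        }
}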
@@ -2,13 +2,13 @@
 #include "constant_tensor_descriptor.cuh"

 // optimized for scenario if p_in, p_wei, p_out are in register
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_1(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                 WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                 OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -81,13 +81,13 @@ __device__ void threadwise_direct_convolution_1(InDesc,

 // Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
 // Copy in and wei into register before doing convolution
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_2(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                 WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                 OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
     constexpr auto in_desc = InDesc{};
     constexpr auto wei_desc = WeiDesc{};
@@ -97,8 +97,8 @@ __device__ void threadwise_direct_convolution_2(InDesc,
     constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());

     // register
-    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
-    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
+    Float p_in_reg[in_reg_desc.GetElementSpace()];
+    Float p_wei_reg[wei_reg_desc.GetElementSpace()];

     // copy input tensor into register
     threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc);
@@ -114,13 +114,13 @@ __device__ void threadwise_direct_convolution_2(InDesc,
 // optimized for scenario where p_in and p_wei are in LDS, p_out is in register
 // break down a non-1x1 convolution into a sequence of 1x1 convolutions,
 // load 1x1 weight into register, and do 1x1 convolution in register.
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_3(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                 WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                 OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -139,8 +139,8 @@ __device__ void threadwise_direct_convolution_3(InDesc,
     constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
         Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});

-    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
-    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
+    Float p_in_reg[in_reg_desc.GetElementSpace()];
+    Float p_wei_reg[wei_reg_desc.GetElementSpace()];

     constexpr unsigned in_w_new_read = 1;

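The three threadwise variants above differ only in where their operands live (register vs. LDS) and how they are staged; the arithmetic each one performs is the plain direct convolution below. A hedged reference sketch with hard-coded row-major layouts in place of the repo's tensor descriptors; naive_threadwise_convolution and the compile-time sizes are made-up for illustration.

// out[k][ho][wo] += sum over c, s, r of wei[k][c][s][r] * in[c][ho+s][wo+r]
template <class Float,
          unsigned K, unsigned C, unsigned S, unsigned R,
          unsigned Ho, unsigned Wo>
__device__ void naive_threadwise_convolution(Float const* __restrict__ p_in,  // [C][Ho+S-1][Wo+R-1]
                                             Float const* __restrict__ p_wei, // [K][C][S][R]
                                             Float* __restrict__ p_out)       // [K][Ho][Wo]
{
    constexpr unsigned Hi = Ho + S - 1;
    constexpr unsigned Wi = Wo + R - 1;

    for(unsigned k = 0; k < K; ++k)
        for(unsigned ho = 0; ho < Ho; ++ho)
            for(unsigned wo = 0; wo < Wo; ++wo)
            {
                // accumulate on top of the existing output value,
                // as the descriptor-based versions do
                Float acc = p_out[(k * Ho + ho) * Wo + wo];

                for(unsigned c = 0; c < C; ++c)
                    for(unsigned s = 0; s < S; ++s)
                        for(unsigned r = 0; r < R; ++r)
                            acc += p_wei[((k * C + c) * S + s) * R + r] *
                                   p_in[(c * Hi + ho + s) * Wi + wo + r];

                p_out[(k * Ho + ho) * Wo + wo] = acc;
            }
}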
@@ -1,8 +1,8 @@
 #pragma once
 #include "constant_tensor_descriptor.cuh"

-template <class TFloat, class Desc, class F>
-__device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __restrict__ p, F f)
+template <class Float, class Desc, class F>
+__device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __restrict__ p, F f)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -37,12 +37,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __r

 // TODO: in order to optimize mem access for different mem type,
 // need to write specialized version
-template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, class Reorder, class F>
+template <class Float, class SrcDesc, class DstDesc, class RefDesc, class Reorder, class F>
 __device__ void
 threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
-                                                        TFloat* const __restrict__ p_src,
+                                                        Float* const __restrict__ p_src,
                                                         DstDesc,
-                                                        TFloat* __restrict__ p_dst,
+                                                        Float* __restrict__ p_dst,
                                                         RefDesc,
                                                         Reorder,
                                                         F f)
@@ -83,26 +83,22 @@ threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
     }
 }

-template <class TFloat, class Desc>
-__device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p)
+template <class Float, class Desc>
+__device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p)
 {
-    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
+    auto f_set_zero = [](Float& v) { v = Float(0); };

-    threadwise_4d_tensor_pointwise_operation_unary<TFloat, Desc, decltype(f_set_zero)>(
+    threadwise_4d_tensor_pointwise_operation_unary<Float, Desc, decltype(f_set_zero)>(
         Desc{}, p, f_set_zero);
 }

-template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, class Reorder>
-__device__ void threadwise_4d_tensor_copy_reorder(SrcDesc,
-                                                  TFloat* const __restrict__ p_src,
-                                                  DstDesc,
-                                                  TFloat* __restrict__ p_dst,
-                                                  RefDesc,
-                                                  Reorder)
+template <class Float, class SrcDesc, class DstDesc, class RefDesc, class Reorder>
+__device__ void threadwise_4d_tensor_copy_reorder(
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
 {
-    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
+    auto f_copy = [](const Float& src, Float& dst) { dst = src; };

-    threadwise_4d_tensor_pointwise_operation_binary_reorder<TFloat,
+    threadwise_4d_tensor_pointwise_operation_binary_reorder<Float,
                                                             SrcDesc,
                                                             DstDesc,
                                                             RefDesc,
@@ -111,18 +107,18 @@ __device__ void threadwise_4d_tensor_copy_reorder(SrcDesc,
         SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
 }

-template <class TFloat, class SrcDesc, class DstDesc, class RefDesc>
+template <class Float, class SrcDesc, class DstDesc, class RefDesc>
 __device__ void threadwise_4d_tensor_copy(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
+    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
 {
     auto reorder = Sequence<0, 1, 2, 3>{};

-    threadwise_4d_tensor_copy_reorder<TFloat, SrcDesc, DstDesc, RefDesc, decltype(reorder)>(
+    threadwise_4d_tensor_copy_reorder<Float, SrcDesc, DstDesc, RefDesc, decltype(reorder)>(
         SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
 }

-template <class TFloat, class Desc, class IDim, class NShift>
-__device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, IDim, NShift)
+template <class Float, class Desc, class IDim, class NShift>
+__device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
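The copy-with-reorder routines above pair a source tensor with a destination whose dimension order is a compile-time permutation (the Sequence passed as Reorder); the plain copy is just the identity permutation Sequence<0, 1, 2, 3>. Below is a self-contained 4-D sketch of that index mapping under one assumed convention (destination dimension d takes source dimension Md); copy_4d_reorder is a hypothetical name, and the repo's actual Reorder convention may differ.

template <class Float,
          unsigned L0, unsigned L1, unsigned L2, unsigned L3, // src lengths
          unsigned M0, unsigned M1, unsigned M2, unsigned M3> // dst dim d reads src dim Md
__device__ void copy_4d_reorder(Float const* __restrict__ p_src, Float* __restrict__ p_dst)
{
    constexpr unsigned src_len[4] = {L0, L1, L2, L3};
    constexpr unsigned map[4]     = {M0, M1, M2, M3};

    // dst lengths are the src lengths permuted by the reorder
    constexpr unsigned dst_len[4] = {
        src_len[map[0]], src_len[map[1]], src_len[map[2]], src_len[map[3]]};

    for(unsigned i0 = 0; i0 < L0; ++i0)
        for(unsigned i1 = 0; i1 < L1; ++i1)
            for(unsigned i2 = 0; i2 < L2; ++i2)
                for(unsigned i3 = 0; i3 < L3; ++i3)
                {
                    const unsigned s[4] = {i0, i1, i2, i3};

                    const unsigned src_idx = ((i0 * L1 + i1) * L2 + i2) * L3 + i3;
                    const unsigned dst_idx =
                        ((s[map[0]] * dst_len[1] + s[map[1]]) * dst_len[2] + s[map[2]]) *
                            dst_len[3] +
                        s[map[3]];

                    p_dst[dst_idx] = p_src[src_idx];
                }
}

With the identity permutation the mapping degenerates to a straight packed copy, mirroring how threadwise_4d_tensor_copy forwards Sequence<0, 1, 2, 3> to the reorder version.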