refactor

[ROCm/composable_kernel commit: 0b8e67ef08]
2026-05-17 03:19:48 +00:00 · 2019-01-08 14:05:03 -06:00
parent 81ffcfcd88
commit 220f5aea05
7 changed files with 81 additions and 89 deletions
--- a/src/include/threadwise_direct_convolution.cuh
+++ b/src/include/threadwise_direct_convolution.cuh
@@ -2,13 +2,13 @@
 #include "constant_tensor_descriptor.cuh"

 // optimized for scenario if p_in, p_wei, p_out are in register
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_1(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
@@ -81,13 +81,13 @@ __device__ void threadwise_direct_convolution_1(InDesc,

 // Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
 // Copy in and wei into register before doing convolution
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_2(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
@@ -97,8 +97,8 @@ __device__ void threadwise_direct_convolution_2(InDesc,
    constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());

    // register
-    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
-    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
+    Float p_in_reg[in_reg_desc.GetElementSpace()];
+    Float p_wei_reg[wei_reg_desc.GetElementSpace()];

    // copy input tensor into register
    threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc);
@@ -114,13 +114,13 @@ __device__ void threadwise_direct_convolution_2(InDesc,
 // optimized for scenario where p_in and p_wei are in LDS, p_out is in register
 // break down a non-1x1 convolution into a sequence of 1x1 convolutions,
 // load 1x1 weight into register, and do 1x1 convolution in register.
-template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
 __device__ void threadwise_direct_convolution_3(InDesc,
-                                                TFloat* const __restrict__ p_in,
+                                                Float* const __restrict__ p_in,
                                                WeiDesc,
-                                                TFloat* const __restrict__ p_wei,
+                                                Float* const __restrict__ p_wei,
                                                OutDesc,
-                                                TFloat* __restrict__ p_out)
+                                                Float* __restrict__ p_out)
 {
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
@@ -139,8 +139,8 @@ __device__ void threadwise_direct_convolution_3(InDesc,
    constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
        Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});

-    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
-    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
+    Float p_in_reg[in_reg_desc.GetElementSpace()];
+    Float p_wei_reg[wei_reg_desc.GetElementSpace()];

    constexpr unsigned in_w_new_read = 1;