update hip build

2026-05-12 01:10:17 +00:00 · 2019-03-12 17:20:11 -05:00
parent 0c88a3d891
commit 2c9b8c2432
12 changed files with 97 additions and 82 deletions
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -47,7 +47,7 @@ struct GeneratorTensor_3
        std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};
        std::vector<std::size_t> lens(sizeof...(Is), 100);
        std::vector<std::size_t> strides(sizeof...(Is), 1);
-        std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1);
+        std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is)-1), strides.rbegin() + 1);
        return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1;
 #endif
    }
@@ -353,7 +353,7 @@ void host_winograd_3x3_convolution(
            std::size_t ho = HoPerTile * htile + j;
            for(int i = 0; i < WoPerTile; ++i)
            {
-                std::size_t wo    = WoPerTile * wtile + i;
+                std::size_t wo = WoPerTile * wtile + i;
                out(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
            }
        }
@@ -406,13 +406,13 @@ int main(int argc, char* argv[])
    constexpr unsigned WPad = 0;
 #elif 1
    // 3x3, 34x34
-    constexpr unsigned N = 64;
-    constexpr unsigned C = 256;
+    constexpr unsigned N  = 64;
+    constexpr unsigned C  = 256;
    constexpr unsigned HI = 34;
    constexpr unsigned WI = 34;
-    constexpr unsigned K = 64;
-    constexpr unsigned Y = 3;
-    constexpr unsigned X = 3;
+    constexpr unsigned K  = 64;
+    constexpr unsigned Y  = 3;
+    constexpr unsigned X  = 3;

    constexpr unsigned HPad = 0;
    constexpr unsigned WPad = 0;
--- a/src/include/blockwise_4d_tensor_op.hip.hpp
+++ b/src/include/blockwise_4d_tensor_op.hip.hpp
@@ -245,10 +245,11 @@ struct BlockwiseChwnTensorCopyPadded
        constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;

        const Float* p_src_tmp =
-            p_src + src_desc.Get1dIndex(c_block_data_begin,
-                                        (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
-                                        (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
-                                        n_block_data_begin);
+            p_src +
+            src_desc.Get1dIndex(c_block_data_begin,
+                                (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
+                                (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
+                                n_block_data_begin);

 #if 0
        if(get_thread_local_1d_id() == 0)
--- a/src/include/blockwise_direct_convolution.hip.hpp
+++ b/src/include/blockwise_direct_convolution.hip.hpp
@@ -93,10 +93,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
        Float p_out_thread[out_thread_desc.GetElementSpace()];

        threadwise_4d_tensor_copy(out_block_desc,
-                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                          k_thread_data_begin,
-                                                                          ho_thread_data_begin,
-                                                                          wo_thread_data_begin),
+                                  p_out_block +
+                                      out_block_desc.Get1dIndex(n_thread_data_begin,
+                                                                k_thread_data_begin,
+                                                                ho_thread_data_begin,
+                                                                wo_thread_data_begin),
                                  out_thread_desc,
                                  p_out_thread,
                                  out_thread_desc.GetLengths());
@@ -107,10 +108,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
            // threadwise convolution
            threadwise_direct_convolution_2(
                in_thread_block_desc,
-                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                      c_thread_data_begin,
-                                                      hi_thread_data_begin,
-                                                      wi_thread_data_begin),
+                p_in_block +
+                    in_block_desc.Get1dIndex(n_thread_data_begin,
+                                             c_thread_data_begin,
+                                             hi_thread_data_begin,
+                                             wi_thread_data_begin),
                wei_thread_block_desc,
                p_wei_block +
                    wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
@@ -122,10 +124,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
        threadwise_4d_tensor_copy(out_thread_desc,
                                  p_out_thread,
                                  out_block_desc,
-                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                          k_thread_data_begin,
-                                                                          ho_thread_data_begin,
-                                                                          wo_thread_data_begin),
+                                  p_out_block +
+                                      out_block_desc.Get1dIndex(n_thread_data_begin,
+                                                                k_thread_data_begin,
+                                                                ho_thread_data_begin,
+                                                                wo_thread_data_begin),
                                  out_thread_desc.GetLengths());
    }
 }
--- a/src/include/blockwise_gemm.hip.hpp
+++ b/src/include/blockwise_gemm.hip.hpp
@@ -431,12 +431,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
        constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;

-        // loop over k
+// loop over k
 #pragma unroll
        for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
        {
-            // read first batch of A, B
-            //   copy A-sub to form A
+// read first batch of A, B
+//   copy A-sub to form A
 #pragma unroll
            for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
            {
@@ -449,7 +449,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                    a_thread_sub_mtx.GetLengths());
            }

-            //   copy B-sub to form B
+//   copy B-sub to form B
 #pragma unroll
            for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
            {
@@ -462,7 +462,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                    b_thread_sub_mtx.GetLengths());
            }

-            // loop over batch
+// loop over batch
 #pragma unroll
            for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
            {
@@ -557,8 +557,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
            {
                threadwise_matrix_copy(
                    c_thread_sub_mtx,
-                    p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
-                                                             n_repeat * NPerLevel1Cluster),
+                    p_c_thread +
+                        c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
+                                                    n_repeat * NPerLevel1Cluster),
                    c_block_mtx,
                    p_c_block +
                        c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
@@ -656,8 +657,9 @@ struct BlockwiseGemmBlockABlockBThreadC
            constexpr unsigned NClusterWork =
                (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);

-            static_assert(BlockSize == (MClusterWork * MThreadPerCluster) *
-                                           (NClusterWork * NThreadPerCluster),
+            static_assert(BlockSize ==
+                              (MClusterWork * MThreadPerCluster) *
+                                  (NClusterWork * NThreadPerCluster),
                          "wrong! wrong BlockSize");

            if(DistributeThreadAlongColumnFirst)
@@ -1256,8 +1258,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                            p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
                            c_thread_sub_mtx,
                            False,
-                            p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
-                                                                 n_repeat * NPerThreadSubC),
+                            p_c_thread +
+                                c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
+                                                        n_repeat * NPerThreadSubC),
                            f_accum);
                    }
                }
--- a/src/include/common.hip.hpp
+++ b/src/include/common.hip.hpp
@@ -65,7 +65,7 @@ struct vector_type<half_float::half, 8>
 };
 #endif

-#if 1
+#if 0
 template <>
 struct vector_type<half, 1>
 {
@@ -139,6 +139,7 @@ struct Sequence
    }
 };

+#if DEVICE_BACKEND_CUDA
 template <typename T>
 __host__ __device__ constexpr T max(T a, T b)
 {
@@ -150,6 +151,7 @@ __host__ __device__ constexpr T min(T a, T b)
 {
    return a < b ? a : b;
 }
+#endif

 __host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b)
 {
--- a/src/include/config.h.in
+++ b/src/include/config.h.in
@@ -4,7 +4,6 @@

 #if DEVICE_BACKEND_HIP
 #include "hip/hip_runtime.h"
-#include "half.hpp"
 #elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
 #include "nvToolsExt.h"
--- a/src/include/gridwise_direct_convolution_1.hip.hpp
+++ b/src/include/gridwise_direct_convolution_1.hip.hpp
@@ -113,10 +113,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
        c_block_work_begin += CPerBlock)
    {
        // copy input tensor to LDS
-        blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin,
-                                                                      c_block_work_begin,
-                                                                      hi_block_work_begin,
-                                                                      wi_block_work_begin),
+        blockwise_in_copy.Run(p_in_global +
+                                  in_global_desc.Get1dIndex(n_block_work_begin,
+                                                            c_block_work_begin,
+                                                            hi_block_work_begin,
+                                                            wi_block_work_begin),
                              p_in_block);

        // copy weight tensor to LDS
@@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
    }

    // copy output tensor from LDS to device mem
-    blockwise_out_copy.Run(p_out_block,
-                           p_out_global + out_global_desc.Get1dIndex(n_block_work_begin,
-                                                                     k_block_work_begin,
-                                                                     ho_block_work_begin,
-                                                                     wo_block_work_begin));
+    blockwise_out_copy.Run(
+        p_out_block,
+        p_out_global +
+            out_global_desc.Get1dIndex(
+                n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin));
 }
--- a/src/include/gridwise_direct_convolution_2.hip.hpp
+++ b/src/include/gridwise_direct_convolution_2.hip.hpp
@@ -139,10 +139,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
        c_block_data_begin += CPerBlock, __syncthreads())
    {
        // copy input tensor to LDS
-        blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
-                                                                      c_block_data_begin,
-                                                                      hi_block_data_begin,
-                                                                      wi_block_data_begin),
+        blockwise_in_copy.Run(p_in_global +
+                                  in_global_desc.Get1dIndex(n_block_data_begin,
+                                                            c_block_data_begin,
+                                                            hi_block_data_begin,
+                                                            wi_block_data_begin),
                              p_in_block);

        // copy weight tensor to LDS
@@ -158,10 +159,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
 #if 1
            threadwise_direct_convolution_2(
                in_thread_block_desc,
-                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                      c_thread_data,
-                                                      hi_thread_data_begin,
-                                                      wi_thread_data_begin),
+                p_in_block +
+                    in_block_desc.Get1dIndex(n_thread_data_begin,
+                                             c_thread_data,
+                                             hi_thread_data_begin,
+                                             wi_thread_data_begin),
                wei_thread_block_desc,
                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
                out_thread_desc,
@@ -169,10 +171,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
 #elif 0
            threadwise_direct_convolution_3(
                in_thread_block_desc,
-                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                      c_thread_data,
-                                                      hi_thread_data_begin,
-                                                      wi_thread_data_begin),
+                p_in_block +
+                    in_block_desc.Get1dIndex(n_thread_data_begin,
+                                             c_thread_data,
+                                             hi_thread_data_begin,
+                                             wi_thread_data_begin),
                wei_thread_block_desc,
                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
                out_thread_desc,
@@ -186,9 +189,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
        out_thread_desc,
        p_out_thread,
        out_global_desc,
-        p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
-                                                  k_block_data_begin + k_thread_data_begin,
-                                                  ho_block_data_begin + ho_thread_data_begin,
-                                                  wo_block_data_begin + wo_thread_data_begin),
+        p_out_global +
+            out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                                       k_block_data_begin + k_thread_data_begin,
+                                       ho_block_data_begin + ho_thread_data_begin,
+                                       wo_block_data_begin + wo_thread_data_begin),
        out_thread_desc.GetLengths());
 }
--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp
@@ -184,8 +184,9 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
    threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread);

    const Float* p_in_global_block_begin =
-        p_in_global + in_chwn_global_desc.Get1dIndex(
-                          0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
+        p_in_global +
+        in_chwn_global_desc.Get1dIndex(
+            0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);

    const Float* p_wei_global_block_begin =
        p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
@@ -216,7 +217,7 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
        }
    }

-    // output: register to global mem,
+// output: register to global mem,
 #if 0
    const auto c_thread_mtx_begin =
        blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
@@ -286,16 +287,17 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
        }
 #endif

-        threadwise_8d_tensor_copy(out_8d_thread_desc,
-                                  p_out_thread,
-                                  out_8d_global_desc,
-                                  p_out_global + out_khwn_global_desc.Get1dIndex(
-                                                     k_block_data_begin + k_thread_data_begin,
-                                                     ho_block_data_begin + ho_thread_data_begin,
-                                                     wo_block_data_begin + wo_thread_data_begin,
-                                                     n_block_data_begin + n_thread_data_begin),
-                                  out_8d_thread_desc.GetLengths(),
-                                  Number<OutThreadCopyDataPerWrite>{});
+        threadwise_8d_tensor_copy(
+            out_8d_thread_desc,
+            p_out_thread,
+            out_8d_global_desc,
+            p_out_global +
+                out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                ho_block_data_begin + ho_thread_data_begin,
+                                                wo_block_data_begin + wo_thread_data_begin,
+                                                n_block_data_begin + n_thread_data_begin),
+            out_8d_thread_desc.GetLengths(),
+            Number<OutThreadCopyDataPerWrite>{});
    }
    else if(NPerThread == NPerBlock)
    {
--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
@@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
        out_hkwn_thread_desc,
        p_out_thread,
        out_khwn_global_desc,
-        p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin,
-                                                       n_block_data_begin + n_thread_data_begin),
+        p_out_global +
+            out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                            ho_block_data_begin + ho_thread_data_begin,
+                                            wo_block_data_begin + wo_thread_data_begin,
+                                            n_block_data_begin + n_thread_data_begin),
        out_hkwn_thread_desc.GetLengths(),
        reorder_khwn_from_hkwn);
 }
--- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
@@ -121,7 +121,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
                               decltype(in_cb_block_desc),
                               decltype(in_cb_block_desc.GetLengths())>{};
 #elif 0
-    const auto blockwise_in_copy  = Blockwise2dTensorCopy2<BlockSize,
+    const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
                                                          Float,
                                                          decltype(in_cb_global_desc),
                                                          decltype(in_cb_block_desc),
@@ -129,7 +129,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
                                                          InBlockCopyThreadPerDim0,
                                                          InBlockCopyThreadPerDim1>{};
 #elif 1
-    const auto blockwise_in_copy  = Blockwise2dTensorCopy3<BlockSize,
+    const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
                                                          Float,
                                                          decltype(in_cb_global_desc),
                                                          decltype(in_cb_block_desc),
--- a/src/include/tensor.hpp
+++ b/src/include/tensor.hpp
@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
    return os;
 }

-typedef enum
-{
+typedef enum {
    Half  = 0,
    Float = 1,
 } DataType_t;