experiment with hip compiler

2026-07-17 09:08:35 +00:00 · 2019-03-22 19:40:01 -05:00
parent 52ae168b17
commit 68ae0731f1
4 changed files with 157 additions and 79 deletions
--- a/build/cmake-hip.sh
+++ b/build/cmake-hip.sh
@@ -11,6 +11,9 @@ cmake
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                               \
 -D CMAKE_BUILD_TYPE=Release                                                                 \
 -D DEVICE_BACKEND="HIP"                                                                     \
+-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only"                                  \
+-D CMAKE_CXX_FLAGS="-gline-tables-only"                                                     \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                   \
+-D CMAKE_PREFIX_PATH="/opt/rocm;/home/package/build/mlopen_dep"                             \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                           \
 ${MY_PROJECT_SOURCE}
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -593,9 +593,9 @@ int main(int argc, char* argv[])
    constexpr unsigned HPad = 0;
    constexpr unsigned WPad = 0;
 #elif 1
-    // 1x1 filter, 14x14 image, C = 256
+    // 1x1 filter, 14x14 image, C = 512
    constexpr unsigned N  = 128;
-    constexpr unsigned C  = 256;
+    constexpr unsigned C  = 512;
    constexpr unsigned HI = 14;
    constexpr unsigned WI = 14;
    constexpr unsigned K  = 512;
--- a/src/include/blockwise_gemm.hip.hpp
+++ b/src/include/blockwise_gemm.hip.hpp
@@ -435,12 +435,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
 #pragma unroll
        for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
        {
-            // read first batch of A, B
-            //   copy A-sub to form A
-            //#pragma unroll
+// read first batch of A, B
+//   copy A-sub to form A
+#pragma unroll
            for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
            {
-#if 0
                threadwise_matrix_copy(
                    a_block_mtx,
                    p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
@@ -448,25 +447,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                    a_thread_mtx,
                    p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
                    a_thread_sub_mtx.GetLengths());
-#else
-                for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
-                {
-                    for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
-                    {
-                        p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] =
-                            p_a_block[a_block_mtx.Get1dIndex(k_begin + i,
-                                                             m_repeat * MPerLevel1Cluster + j) +
-                                      mMyThreadOffsetA];
-                    }
-                }
-#endif
            }

-            //   copy B-sub to form B
-            //#pragma unroll
+//   copy B-sub to form B
+#pragma unroll
            for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
            {
-#if 0
                threadwise_matrix_copy(
                    b_block_mtx,
                    p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
@@ -474,26 +460,13 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                    b_thread_mtx,
                    p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
                    b_thread_sub_mtx.GetLengths());
-#else
-                for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
-                {
-                    for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
-                    {
-                        p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] =
-                            p_b_block[b_block_mtx.Get1dIndex(k_begin + i,
-                                                             n_repeat * MPerLevel1Cluster + j) +
-                                      mMyThreadOffsetB];
-                    }
-                }
-#endif
            }

-            // loop over batch
-            //#pragma unroll
+// loop over batch
+#pragma unroll
            for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
            {
-// do current batch of gemm
-#if 0
+                // do current batch of gemm
                threadwise_gemm(a_thread_mtx,
                                True,
                                p_a_thread,
@@ -504,7 +477,140 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                                False,
                                p_c_thread + ib * ThreadMatrixStrideC,
                                f_accum);
-#else
+
+                // read next batch of a, b
+                if(BlockMatrixStrideA != 0)
+                {
+#pragma unroll
+                    for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
+                    {
+                        threadwise_matrix_copy(
+                            a_block_mtx,
+                            p_a_block +
+                                a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
+                                (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA,
+                            a_thread_mtx,
+                            p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                            a_thread_sub_mtx.GetLengths());
+                    }
+                }
+
+                if(BlockMatrixStrideB != 0)
+                {
+#pragma unroll
+                    for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
+                    {
+                        threadwise_matrix_copy(
+                            b_block_mtx,
+                            p_b_block +
+                                b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                                (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB,
+                            b_thread_mtx,
+                            p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                            b_thread_sub_mtx.GetLengths());
+                    }
+                }
+            }
+
+            // do last batch of gemm
+            threadwise_gemm(a_thread_mtx,
+                            True,
+                            p_a_thread,
+                            b_thread_mtx,
+                            False,
+                            p_b_thread,
+                            c_thread_mtx,
+                            False,
+                            p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC,
+                            f_accum);
+        }
+    }
+
+    // this version put copy and compute in same place, experimenting with compiler behaviour
+    template <class FloatA, class FloatB, class FloatC, class Accumulator>
+    __device__ void Run_v2(const FloatA* __restrict__ p_a_block,
+                           const FloatB* __restrict__ p_b_block,
+                           FloatC* __restrict__ p_c_thread,
+                           Accumulator f_accum) const
+    {
+        constexpr auto True  = integral_constant<bool, true>{};
+        constexpr auto False = integral_constant<bool, false>{};
+
+        constexpr auto a_block_mtx  = BlockMatrixA{};
+        constexpr auto b_block_mtx  = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
+
+        constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed
+
+        constexpr unsigned MPerThread = c_thread_mtx.NRow();
+        constexpr unsigned NPerThread = c_thread_mtx.NCol();
+
+        // thread A, B for GEMM
+        //   A is transposed, b is not
+        constexpr auto a_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
+
+        // thread A-sub, B-sub for copy
+        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
+
+        FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
+        FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
+
+        constexpr unsigned MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
+        constexpr unsigned NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
+
+        constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
+        constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;
+
+        // loop over k
+        //#pragma unroll
+        for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
+        {
+            // read first batch of A, B
+            //   copy A-sub to form A
+            //#pragma unroll
+            for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
+            {
+                for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
+                {
+                    for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
+                    {
+                        p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] =
+                            p_a_block[a_block_mtx.Get1dIndex(k_begin + i,
+                                                             m_repeat * MPerLevel1Cluster + j) +
+                                      mMyThreadOffsetA];
+                    }
+                }
+            }
+
+            //   copy B-sub to form B
+            //#pragma unroll
+            for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
+            {
+                for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
+                {
+                    for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
+                    {
+                        p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] =
+                            p_b_block[b_block_mtx.Get1dIndex(k_begin + i,
+                                                             n_repeat * MPerLevel1Cluster + j) +
+                                      mMyThreadOffsetB];
+                    }
+                }
+            }
+
+            // loop over batch
+            //#pragma unroll
+            for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
+            {
+                // do current batch of gemm
                for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
                {
                    for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
@@ -521,7 +627,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                        }
                    }
                }
-#endif

                // read next batch of a, b
                if(BlockMatrixStrideA != 0)
@@ -529,16 +634,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                    //#pragma unroll
                    for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
                    {
-#if 0
-                        threadwise_matrix_copy(
-                            a_block_mtx,
-                            p_a_block +
-                                a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
-                                (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA,
-                            a_thread_mtx,
-                            p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
-                            a_thread_sub_mtx.GetLengths());
-#else
                        for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
                        {
                            for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
@@ -550,7 +645,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                                              (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA];
                            }
                        }
-#endif
                    }
                }

@@ -559,16 +653,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                    //#pragma unroll
                    for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
                    {
-#if 0
-                        threadwise_matrix_copy(
-                            b_block_mtx,
-                            p_b_block +
-                                b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
-                                (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB,
-                            b_thread_mtx,
-                            p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
-                            b_thread_sub_mtx.GetLengths());
-#else
                        for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
                        {
                            for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
@@ -580,24 +664,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                                              (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB];
                            }
                        }
-#endif
                    }
                }
            }

-// do last batch of gemm
-#if 0
-            threadwise_gemm(a_thread_mtx,
-                            True,
-                            p_a_thread,
-                            b_thread_mtx,
-                            False,
-                            p_b_thread,
-                            c_thread_mtx,
-                            False,
-                            p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC,
-                            f_accum);
-#else
+            // do last batch of gemm
            for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
            {
                for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
@@ -613,7 +684,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                    }
                }
            }
-#endif
        }
    }

--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp
@@ -209,10 +209,15 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
        {
            for(unsigned x = 0; x < X; ++x)
            {
-                blockwise_batch_gemm.Run(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
-                                         p_in_block + in_chwn_block_desc.Get1dIndex(0, y, x, 0),
-                                         p_out_thread,
-                                         [](auto& acc, const auto&& v) { acc += v; });
+#if 1
+                blockwise_batch_gemm.Run
+#elif 0
+                blockwise_batch_gemm.Run_v2
+#endif
+                    (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
+                     p_in_block + in_chwn_block_desc.Get1dIndex(0, y, x, 0),
+                     p_out_thread,
+                     [](auto& acc, const auto&& v) { acc += v; });
            }
        }
    }