From 23f633cdc5eafb6f110046794ad1384d723f1157 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Mon, 17 Jun 2019 20:14:18 -0500
Subject: [PATCH] clean up for miopen

---
 ...mm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 10 ++-----
 .../ConstantMatrixDescriptor.hpp              |  8 ++---
 .../tensor_operation/blockwise_gemm.hpp       | 29 ++-----------------
 .../blockwise_generic_tensor_slice_copy.hpp   |  6 ++--
 .../tensor_operation/threadwise_gemm.hpp      |  2 +-
 .../threadwise_generic_tensor_slice_copy.hpp  |  1 +
 .../include/utility/Sequence.hpp              |  2 +-
 .../include/utility/amd_inline_asm.hpp        |  2 +-
 .../include/utility/config_amd.hpp.in         |  4 +++
 .../include/utility/config_nvidia.hpp.in      |  4 +++
 .../include/utility/functional.hpp            |  2 +-
 composable_kernel/include/utility/utility.hpp |  4 ---
 12 files changed, 22 insertions(+), 52 deletions(-)

diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp
index 73cec0bb10..6193a1a4d3 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -66,10 +66,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
-        constexpr auto I4 = Number<4>{};
         constexpr auto I5 = Number<5>{};
-        constexpr auto I6 = Number<6>{};
-        constexpr auto I7 = Number<7>{};
 
         constexpr auto True = integral_constant<bool, true>{};
 
@@ -77,10 +74,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
         constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
         constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};
 
-        constexpr index_t N  = in_n_c_h_w_global_desc.GetLength(I0);
-        constexpr index_t C  = in_n_c_h_w_global_desc.GetLength(I1);
-        constexpr index_t Hi = in_n_c_h_w_global_desc.GetLength(I2);
-        constexpr index_t Wi = in_n_c_h_w_global_desc.GetLength(I3);
+        constexpr index_t N = in_n_c_h_w_global_desc.GetLength(I0);
+        constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1);
 
         constexpr index_t K  = out_n_k_h_w_global_desc.GetLength(I1);
         constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2);
@@ -346,7 +341,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
         {
             constexpr index_t K2 = GemmMPerThreadSubC;
             constexpr index_t K1 = GemmMLevel0Cluster * GemmMLevel1Cluster;
-            constexpr index_t K0 = K / (K1 * K2);
 
             // define tensor descriptor for threadwise copy
             // output memory layout descriptor in register
diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp
index e24f16aa16..af7bc1d354 100644
--- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp
+++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp
@@ -54,12 +54,8 @@ __host__ __device__ constexpr auto
 template <class TDesc>
 __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)
 {
-    const auto desc = TDesc{};
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-
-    printf("%s NRow %u NCol %u RowStride %u\n", s, desc.NRow(), desc.NCol(), desc.RowStride());
+    printf(
+        "%s NRow %u NCol %u RowStride %u\n", s, TDesc::NRow(), TDesc::NCol(), TDesc::RowStride());
 }
 
 } // namespace ck
diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp
index b1e6171f4e..2052747c65 100644
--- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp
@@ -49,7 +49,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
 
         constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
         constexpr index_t N = BlockMatrixB::NCol();
-        constexpr index_t K = BlockMatrixA::NRow();
 
         static_assert(M % (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster) == 0 &&
                           N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
@@ -100,12 +99,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     {
         constexpr auto c_thread_mtx = ThreadMatrixC{};
 
-        constexpr index_t MPerThread = c_thread_mtx.NRow();
-        constexpr index_t NPerThread = c_thread_mtx.NCol();
-
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-
         constexpr index_t MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
         constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
 
@@ -125,9 +118,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                         const FloatB* __restrict__ p_b_block,
                         FloatC* __restrict__ p_c_thread) const
     {
-        constexpr auto True  = integral_constant<bool, true>{};
-        constexpr auto False = integral_constant<bool, false>{};
-
         constexpr auto a_block_mtx  = BlockMatrixA{};
         constexpr auto b_block_mtx  = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};
@@ -146,13 +136,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_thread_mtx =
             make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
 
-        // thread A-sub, B-sub for copy
-        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
-
-        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
-
         FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
         FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
 
@@ -172,9 +155,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
 
             using Float4 = vector_type<float, 4>::MemoryType;
 
-            Float4* reg_a = (Float4*)(p_a_thread);
-            Float4* reg_b = (Float4*)(p_b_thread);
-            Float4* reg_c = (Float4*)(p_c_thread);
+            Float4* reg_a = reinterpret_cast<Float4*>(p_a_thread);
+            Float4* reg_b = reinterpret_cast<Float4*>(p_b_thread);
+            Float4* reg_c = reinterpret_cast<Float4*>(p_c_thread);
 
             reg_a[0] = *reinterpret_cast<const Float4*>(&p_a_block[mMyThreadOffsetA]);
             reg_b[0] = *reinterpret_cast<const Float4*>(&p_b_block[mMyThreadOffsetB]);
@@ -215,8 +198,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_block_mtx  = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};
 
-        constexpr index_t M = a_block_mtx.NCol();
-        constexpr index_t N = b_block_mtx.NCol();
         constexpr index_t K = a_block_mtx.NRow();
 
         constexpr index_t MPerThread = c_thread_mtx.NRow();
@@ -245,8 +226,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
         constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
 
-        const FloatA* const p_a_block_thread_offset = p_a_block + mMyThreadOffsetA;
-
 #pragma unroll
         // loop over k
         for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop)
@@ -306,8 +285,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_block_mtx  = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};
 
-        constexpr index_t M = a_block_mtx.NCol();
-        constexpr index_t N = b_block_mtx.NCol();
         constexpr index_t K = a_block_mtx.NRow();
 
         constexpr index_t MPerThread = c_thread_mtx.NRow();
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index b59d4120ca..ed633158e8 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -93,8 +93,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
                           "wrong! cannot evenly divide sliced tensor into cluster");
         });
 
-        constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
-
         // for now, only support SubLengths == 1 on a merged dimension that constains
         // multiple original dimensions
         static_for<0, nDim, 1>{}([&](auto IDim_) {
@@ -297,7 +295,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         constexpr auto IDim = Number<IDim_>{};
         constexpr index_t idim = IDim;
 
-        static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto fwd) {
+        static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
             // logic for a merged dimension, also works for non-merged dimension, but its logic may
             // be unncessarily complicated for compiler to remove calculations that are useless for
             // a non-merged dimension
@@ -337,7 +335,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
             mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
 
-        }).Else([&](auto fwd) {
+        }).Else([&](auto) {
             // Logic for non-merged dimension. If you are never going to move the slicing window on
             // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
             // which are being calculated here, will never be used later. In this case, compiler
diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp
index d77ad2bbee..fb1540a985 100644
--- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp
@@ -71,7 +71,7 @@ __device__ void threadwise_gemm(MatrixA,
                                 integral_constant<bool, TransC>,
                                 FloatC* __restrict__ p_c_thread)
 {
-    static_if<TransA && (!TransB) && (!TransC)>{}([&](auto fwd) {
+    static_if<TransA && (!TransB) && (!TransC)>{}([&](auto) {
         constexpr auto a_mtx = MatrixA{};
         constexpr auto b_mtx = MatrixB{};
         constexpr auto c_mtx = MatrixC{};
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 16e1e7a153..5cff460050 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -11,6 +11,7 @@
 
 namespace ck {
 
+// user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1
 template <class Float,
           class SrcDesc,
           class DstDesc,
diff --git a/composable_kernel/include/utility/Sequence.hpp b/composable_kernel/include/utility/Sequence.hpp
--- a/composable_kernel/include/utility/Sequence.hpp
+++ b/composable_kernel/include/utility/Sequence.hpp
@@ -xxx,7 +xxx,7 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
 }
 
 template <index_t... Xs, index_t... Ys>
-__host__ __device__ constexpr auto operator-(Sequence<Xs...> seq_x, Sequence<Ys...> seq_y)
+__host__ __device__ constexpr auto operator-(Sequence<Xs...>, Sequence<Ys...>)
 {
     static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");
inconsistent size"); diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index c4643543fc..f973a41b0d 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -102,7 +102,7 @@ __device__ void outerProduct1x4(const float& a, const vector_type::MemoryType& b, vector_type::MemoryType& c) { - outerProduct1x4(&a, (float*)&b, (float*)&c); + outerProduct1x4(&a, reinterpret_cast(&b), reinterpret_cast(&c)); } __device__ void outerProduct4x4(const vector_type::MemoryType& a, diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 404a9853c3..ea8a3356d6 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -19,6 +19,10 @@ typedef float float4_t __attribute__((ext_vector_type(4))); using index_t = uint32_t; +__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } + +__device__ index_t get_block_1d_id() { return blockIdx.x; } + __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) { d += s0 * s1; diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 39afc0b154..038448e9d1 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -23,6 +23,10 @@ using float4_t = float4; using index_t = uint32_t; +__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } + +__device__ index_t get_block_1d_id() { return blockIdx.x; } + __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) { d += s0 * s1; diff --git a/composable_kernel/include/utility/functional.hpp b/composable_kernel/include/utility/functional.hpp index 56bdc4c06b..e1f6b004ce 100644 --- a/composable_kernel/include/utility/functional.hpp +++ b/composable_kernel/include/utility/functional.hpp @@ -18,7 +18,7 @@ struct forwarder struct swallow { template - __host__ __device__ constexpr swallow(Ts&&... ts) + __host__ __device__ constexpr swallow(Ts&&...) { } }; diff --git a/composable_kernel/include/utility/utility.hpp b/composable_kernel/include/utility/utility.hpp index f52f163a7d..fbf86610b1 100644 --- a/composable_kernel/include/utility/utility.hpp +++ b/composable_kernel/include/utility/utility.hpp @@ -6,10 +6,6 @@ namespace ck { -__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } - -__device__ index_t get_block_1d_id() { return blockIdx.x; } - template using is_same = std::is_same;