From 23f633cdc5eafb6f110046794ad1384d723f1157 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Mon, 17 Jun 2019 20:14:18 -0500
Subject: [PATCH] clean up for miopen

---
 ...mm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 10 ++-----
 .../ConstantMatrixDescriptor.hpp              |  8 ++---
 .../tensor_operation/blockwise_gemm.hpp       | 29 ++-----------------
 .../blockwise_generic_tensor_slice_copy.hpp   |  6 ++--
 .../tensor_operation/threadwise_gemm.hpp      |  2 +-
 .../threadwise_generic_tensor_slice_copy.hpp  |  1 +
 .../include/utility/Sequence.hpp              |  2 +-
 .../include/utility/amd_inline_asm.hpp        |  2 +-
 .../include/utility/config_amd.hpp.in         |  4 +++
 .../include/utility/config_nvidia.hpp.in      |  4 +++
 .../include/utility/functional.hpp            |  2 +-
 composable_kernel/include/utility/utility.hpp |  4 ---
 12 files changed, 22 insertions(+), 52 deletions(-)

diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp
index 73cec0bb10..6193a1a4d3 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -66,10 +66,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
-        constexpr auto I4 = Number<4>{};
         constexpr auto I5 = Number<5>{};
-        constexpr auto I6 = Number<6>{};
-        constexpr auto I7 = Number<7>{};
 
         constexpr auto True = integral_constant<bool, true>{};
 
@@ -77,10 +74,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
         constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
         constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};
 
-        constexpr index_t N  = in_n_c_h_w_global_desc.GetLength(I0);
-        constexpr index_t C  = in_n_c_h_w_global_desc.GetLength(I1);
-        constexpr index_t Hi = in_n_c_h_w_global_desc.GetLength(I2);
-        constexpr index_t Wi = in_n_c_h_w_global_desc.GetLength(I3);
+        constexpr index_t N = in_n_c_h_w_global_desc.GetLength(I0);
+        constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1);
 
         constexpr index_t K  = out_n_k_h_w_global_desc.GetLength(I1);
         constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2);
@@ -346,7 +341,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
         {
             constexpr index_t K2 = GemmMPerThreadSubC;
             constexpr index_t K1 = GemmMLevel0Cluster * GemmMLevel1Cluster;
-            constexpr index_t K0 = K / (K1 * K2);
 
             // define tensor descriptor for threadwise copy
             // output memory layout descriptor in register
diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp
index e24f16aa16..af7bc1d354 100644
--- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp
+++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp
@@ -54,12 +54,8 @@ __host__ __device__ constexpr auto
 template <class TDesc>
 __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)
 {
-    const auto desc = TDesc{};
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-
-    printf("%s NRow %u NCol %u RowStride %u\n", s, desc.NRow(), desc.NCol(), desc.RowStride());
+    printf(
+        "%s NRow %u NCol %u RowStride %u\n", s, TDesc::NRow(), TDesc::NCol(), TDesc::RowStride());
 }
 
 } // namespace ck
diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp
index b1e6171f4e..2052747c65 100644
--- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp
@@ -49,7 +49,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
 
         constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
         constexpr index_t N = BlockMatrixB::NCol();
-        constexpr index_t K = BlockMatrixA::NRow();
 
         static_assert(M % (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster) == 0 &&
                           N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
@@ -100,12 +99,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     {
         constexpr auto c_thread_mtx = ThreadMatrixC{};
 
-        constexpr index_t MPerThread = c_thread_mtx.NRow();
-        constexpr index_t NPerThread = c_thread_mtx.NCol();
-
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-
         constexpr index_t MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
         constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
 
@@ -125,9 +118,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                         const FloatB* __restrict__ p_b_block,
                         FloatC* __restrict__ p_c_thread) const
     {
-        constexpr auto True  = integral_constant<bool, true>{};
-        constexpr auto False = integral_constant<bool, false>{};
-
         constexpr auto a_block_mtx  = BlockMatrixA{};
         constexpr auto b_block_mtx  = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};
@@ -146,13 +136,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_thread_mtx =
             make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
 
-        // thread A-sub, B-sub for copy
-        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
-
-        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
-
         FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
         FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
 
@@ -172,9 +155,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
 
             using Float4 = vector_type<float, 4>::MemoryType;
 
-            Float4* reg_a = (Float4*)(p_a_thread);
-            Float4* reg_b = (Float4*)(p_b_thread);
-            Float4* reg_c = (Float4*)(p_c_thread);
+            Float4* reg_a = reinterpret_cast<Float4*>(p_a_thread);
+            Float4* reg_b = reinterpret_cast<Float4*>(p_b_thread);
+            Float4* reg_c = reinterpret_cast<Float4*>(p_c_thread);
 
             reg_a[0] = *reinterpret_cast<const Float4*>(&p_a_block[mMyThreadOffsetA]);
             reg_b[0] = *reinterpret_cast<const Float4*>(&p_b_block[mMyThreadOffsetB]);
@@ -215,8 +198,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_block_mtx  = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};
 
-        constexpr index_t M = a_block_mtx.NCol();
-        constexpr index_t N = b_block_mtx.NCol();
         constexpr index_t K = a_block_mtx.NRow();
 
         constexpr index_t MPerThread = c_thread_mtx.NRow();
@@ -245,8 +226,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
         constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
 
-        const FloatA* const p_a_block_thread_offset = p_a_block + mMyThreadOffsetA;
-
 #pragma unroll
         // loop over k
         for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop)
@@ -306,8 +285,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_block_mtx  = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};
 
-        constexpr index_t M = a_block_mtx.NCol();
-        constexpr index_t N = b_block_mtx.NCol();
         constexpr index_t K = a_block_mtx.NRow();
 
         constexpr index_t MPerThread = c_thread_mtx.NRow();
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index b59d4120ca..ed633158e8 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -93,8 +93,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
                           "wrong! cannot evenly divide sliced tensor into cluster");
         });
 
-        constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
-
         // for now, only support SubLengths == 1 on a merged dimension that constains
         // multiple original dimensions
         static_for<0, nDim, 1>{}([&](auto IDim_) {
@@ -297,7 +295,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         constexpr auto IDim = Number<IDim_>{};
         constexpr index_t idim = IDim;
 
-        static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto fwd) {
+        static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
             // logic for a merged dimension, also works for non-merged dimension, but its logic may
             // be unncessarily complicated for compiler to remove calculations that are useless for
             // a non-merged dimension
@@ -337,7 +335,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
             mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
 
-        }).Else([&](auto fwd) {
+        }).Else([&](auto) {
             // Logic for non-merged dimension. If you are never going to move the slicing window on
             // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
             // which are being calculated here, will never be used later. In this case, compiler
diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp
index d77ad2bbee..fb1540a985 100644
--- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp
@@ -71,7 +71,7 @@ __device__ void threadwise_gemm(MatrixA,
                                 integral_constant<bool, TransC>,
                                 FloatC* __restrict__ p_c_thread)
 {
-    static_if<TransA && (!TransB) && (!TransC)>{}([&](auto fwd) {
+    static_if<TransA && (!TransB) && (!TransC)>{}([&](auto) {
         constexpr auto a_mtx = MatrixA{};
         constexpr auto b_mtx = MatrixB{};
         constexpr auto c_mtx = MatrixC{};
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 16e1e7a153..5cff460050 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -11,6 +11,7 @@
 
 namespace ck {
 
+// user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1
 template <class Float,
           class SrcDesc,
           class DstDesc,
diff --git a/composable_kernel/include/utility/Sequence.hpp b/composable_kernel/include/utility/Sequence.hpp
--- a/composable_kernel/include/utility/Sequence.hpp
+++ b/composable_kernel/include/utility/Sequence.hpp
@@ -xxx,7 +xxx,7 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
 }
 
 template <index_t... Xs, index_t... Ys>
-__host__ __device__ constexpr auto operator-(Sequence<Xs...> seq_x, Sequence<Ys...> seq_y)
+__host__ __device__ constexpr auto operator-(Sequence<Xs...>, Sequence<Ys...>)
 {
     static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");
inconsistent size"); diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index c4643543fc..f973a41b0d 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -102,7 +102,7 @@ __device__ void outerProduct1x4(const float& a, const vector_type::MemoryType& b, vector_type::MemoryType& c) { - outerProduct1x4(&a, (float*)&b, (float*)&c); + outerProduct1x4(&a, reinterpret_cast(&b), reinterpret_cast(&c)); } __device__ void outerProduct4x4(const vector_type::MemoryType& a, diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 404a9853c3..ea8a3356d6 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -19,6 +19,10 @@ typedef float float4_t __attribute__((ext_vector_type(4))); using index_t = uint32_t; +__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } + +__device__ index_t get_block_1d_id() { return blockIdx.x; } + __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) { d += s0 * s1; diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 39afc0b154..038448e9d1 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -23,6 +23,10 @@ using float4_t = float4; using index_t = uint32_t; +__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } + +__device__ index_t get_block_1d_id() { return blockIdx.x; } + __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) { d += s0 * s1; diff --git a/composable_kernel/include/utility/functional.hpp b/composable_kernel/include/utility/functional.hpp index 56bdc4c06b..e1f6b004ce 100644 --- a/composable_kernel/include/utility/functional.hpp +++ b/composable_kernel/include/utility/functional.hpp @@ -18,7 +18,7 @@ struct forwarder struct swallow { template - __host__ __device__ constexpr swallow(Ts&&... ts) + __host__ __device__ constexpr swallow(Ts&&...) { } }; diff --git a/composable_kernel/include/utility/utility.hpp b/composable_kernel/include/utility/utility.hpp index f52f163a7d..fbf86610b1 100644 --- a/composable_kernel/include/utility/utility.hpp +++ b/composable_kernel/include/utility/utility.hpp @@ -6,10 +6,6 @@ namespace ck { -__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } - -__device__ index_t get_block_1d_id() { return blockIdx.x; } - template using is_same = std::is_same;