From 33d1e0e2e54bed6f155dda50bd8d8796b5f20adf Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 17 Jun 2019 14:58:44 -0500 Subject: [PATCH] refactoring for miopen --- ...tion_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp | 2 +- ...tion_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp | 2 +- ..._v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp | 2 +- ...tion_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp | 2 +- ..._v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp | 2 +- ...lution_implicit_gemm_v2_chwn_cyxk_khwn.hpp | 2 +- ...mm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp | 6 +- ...lution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp | 2 +- ...mm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp | 2 +- ...lution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp | 2 +- ...mm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 25 +-- .../ConstantTensorDescriptor.hpp | 11 +- .../blockwise_2d_tensor_op.hpp | 8 +- .../blockwise_3d_tensor_op.hpp | 2 +- .../blockwise_4d_tensor_op.hpp | 2 +- .../blockwise_batched_gemm.hpp | 38 ++-- .../tensor_operation/blockwise_gemm.hpp | 48 +++-- .../blockwise_generic_tensor_slice_copy.hpp | 99 ++++------- .../blockwise_tensor_slice_copy.hpp | 7 +- .../threadwise_4d_tensor_op.hpp | 8 +- .../tensor_operation/threadwise_gemm.hpp | 27 +-- .../threadwise_generic_tensor_slice_copy.hpp | 63 +++---- .../threadwise_tensor_slice_copy.hpp | 7 +- composable_kernel/include/utility/Array.hpp | 2 +- .../include/utility/Sequence.hpp | 87 +-------- .../include/utility/amd_inline_asm.hpp | 168 ------------------ .../include/utility/config_amd.hpp.in | 3 + .../include/utility/config_nvidia.hpp.in | 3 + .../include/utility/functional.hpp | 6 +- .../include/utility/integral_constant.hpp | 17 +- composable_kernel/include/utility/utility.hpp | 28 +-- .../include/utility/vector_type.hpp | 125 ------------- driver/include/tensor.hpp | 4 +- 33 files changed, 195 insertions(+), 617 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp index eca22ce632..5dcf4f415e 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp @@ -264,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn #if 1 blockwise_batch_gemm.Run #else - blockwise_batch_gemm.Run_asm + blockwise_batch_gemm.Run_amd_asm #endif (p_wei_block + wei_c_y_x_k_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp index c54eb80dd9..1455e1797b 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp @@ -191,7 +191,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn #if 0 return blockwise_batch_gemm.Run(Xs...); #elif 0 - return blockwise_batch_gemm.Run_asm(Xs...); + return blockwise_batch_gemm.Run_amd_asm(Xs...); #else return blockwise_batch_gemm.Run_asm_v2(Xs...); #endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp index f421cfa4c3..c5e1712332 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -206,7 +206,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer #if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 - return blockwise_batch_gemm.Run_asm(Xs...); + return blockwise_batch_gemm.Run_amd_asm(Xs...); #else return blockwise_batch_gemm.Run_asm_v2(Xs...); #endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp index 447ce4ce52..3636db9fa6 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp @@ -201,7 +201,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw #if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 - return blockwise_batch_gemm.Run_asm(Xs...); + return blockwise_batch_gemm.Run_amd_asm(Xs...); #else return blockwise_batch_gemm.Run_asm_v2(Xs...); #endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp index 3c71a5afc4..5a7267d75c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp @@ -203,7 +203,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer #if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 - return blockwise_batch_gemm.Run_asm(Xs...); + return blockwise_batch_gemm.Run_amd_asm(Xs...); #else return blockwise_batch_gemm.Run_asm_v2(Xs...); #endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp index a1525e74ee..b39bb66a2c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp @@ -238,7 +238,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 - blockwise_gemm.Run_asm + blockwise_gemm.Run_amd_asm #endif (p_wei_block + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_in_block + y * Wi + x, diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp index 31832b2ef9..52abeab530 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp +++ 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -271,7 +271,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 0 - blockwise_gemm.Run_asm + blockwise_gemm.Run_amd_asm #endif (p_wei_block_now + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), @@ -313,7 +313,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 0 - blockwise_gemm.Run_asm + blockwise_gemm.Run_amd_asm #endif (p_wei_block_double + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), @@ -339,7 +339,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 0 - blockwise_gemm.Run_asm + blockwise_gemm.Run_amd_asm #endif (p_wei_block_double + wei_block_space + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp index d1b77f06e7..70737ebc6c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp @@ -217,7 +217,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw #if 1 return blockwise_gemm.Run(Xs...); #else - return blockwise_gemm.Run_asm(Xs...); + return blockwise_gemm.Run_amd_asm(Xs...); #endif }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp index 0d5b520c53..ca8412355e 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp @@ -214,7 +214,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer #if 1 return blockwise_gemm.Run(Xs...); #else - return blockwise_gemm.Run_asm(Xs...); + return blockwise_gemm.Run_amd_asm(Xs...); #endif }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp index f6535b3d7b..461db757bd 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp @@ -247,7 +247,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw #if 1 return blockwise_gemm.Run(Xs...); #else - return blockwise_gemm.Run_asm(Xs...); + return blockwise_gemm.Run_amd_asm(Xs...); #endif }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp index 433ba2d855..2b41e2640e 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -9,10 +9,6 @@ #include "blockwise_gemm.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" -#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM -#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1 -#endif - namespace ck { // define B = merge(N0, Ho, Wo) @@ -239,15 +235,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer GemmDataPerReadA, GemmDataPerReadB>{}; - // choose GEMM implementation here - const auto run_blockwise_gemm = [&](auto... Xs) { -#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM - return blockwise_gemm.Run_asm(Xs...); -#else - return blockwise_gemm.Run(Xs...); -#endif - }; - // LDS allocation for input and weight: be careful of alignment constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2, WeiBlockCopyDstDataPerWrite_K, @@ -255,9 +242,11 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer GemmDataPerReadB); constexpr index_t in_block_space = - in_e_n1_b_n2_block_desc.GetElementSpace(Number{}); + math::integer_divide_ceil(in_e_n1_b_n2_block_desc.GetElementSpace(), max_align) * + max_align; - constexpr index_t wei_block_space = wei_e_k_block_desc.GetElementSpace(Number{}); + constexpr index_t wei_block_space = + math::integer_divide_ceil(wei_e_k_block_desc.GetElementSpace(), max_align) * max_align; __shared__ Float p_in_block_double[2 * in_block_space]; __shared__ Float p_wei_block_double[2 * wei_block_space]; @@ -309,7 +298,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer p_wei_register_clipboard); // LDS double buffer: GEMM on current data - run_blockwise_gemm(p_wei_block_now, p_in_block_now, p_out_thread); + blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, @@ -336,7 +325,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer p_wei_register_clipboard); // LDS double buffer: GEMM on current data - run_blockwise_gemm(p_wei_block_double, p_in_block_double, p_out_thread); + blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, @@ -348,7 +337,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS double buffer: GEMM on current data - run_blockwise_gemm(p_wei_block_double + wei_block_space, + blockwise_gemm.Run(p_wei_block_double + wei_block_space, p_in_block_double + in_block_space, p_out_thread); } diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp index f93a5a60cd..a8295d6624 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp @@ -107,15 +107,12 @@ struct ConstantTensorDescriptor return accumulate_on_sequence(Lengths{}, math::multiplies{}, Number<1>{}); } - template > - __host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{}) + __host__ __device__ static constexpr index_t GetElementSpace() { - // This is WRONG! 
align should be applied to the last memory rank, not the last tensor - // dimension constexpr index_t element_space_unaligned = accumulate_on_sequence( (GetLengths() - Number<1>{}) * GetStrides(), math::plus{}, Number<1>{}); - return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); + return element_space_unaligned; } // emulate constexpr lambda @@ -234,7 +231,7 @@ struct ConstantTensorDescriptor // do carry check in reversed order, starting from lowest dimension // don't check the highest dimension static_for<0, nDim, 1>{}([&](auto IDimReverse) { - constexpr index_t idim = nDim - 1 - IDimReverse.Get(); + constexpr index_t idim = nDim - 1 - IDimReverse; constexpr auto IDim = Number<idim>{}; if(carry) @@ -260,7 +257,7 @@ struct ConstantTensorDescriptor // do borrow check in reversed order, starting from lowest dimension // don't check the highest dimension static_for<0, nDim, 1>{}([&](auto IDimReverse) { - constexpr index_t idim = nDim - 1 - IDimReverse.Get(); + constexpr index_t idim = nDim - 1 - IDimReverse; constexpr auto IDim = Number<idim>{}; if(borrow) diff --git a/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp index 9354da7861..527c84b67a 100644 --- a/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp @@ -286,7 +286,7 @@ struct Blockwise2dTensorCopy2 __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - static_assert(is_same<Float, float>::value, "wrong! only support float!\n"); + static_assert(is_same<Float, float>{}, "wrong! only support float!\n"); using Float4 = float4; using Float2 = float2; @@ -565,7 +565,7 @@ struct Blockwise2dTensorCopy3 __device__ constexpr index_t GetRegisterClipboardSize() const { - static_assert(is_same<Float, float>::value, "wrong! only support float!\n"); + static_assert(is_same<Float, float>{}, "wrong! only support float!\n"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -714,7 +714,7 @@ struct Blockwise2dTensorCopy3 *(reinterpret_cast(&p_src[mSrcMyThreadOffset + iloop * src_loop_stride])); #else - static_assert(is_same<Float, float>::value && DataPerRead == 4, + static_assert(is_same<Float, float>{} && DataPerRead == 4, "global_load is only for float4"); global_load(reinterpret_cast(p_clipboard[iloop * DataPerRead]), @@ -773,7 +773,7 @@ *(reinterpret_cast(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride]) = *(reinterpret_cast(&p_clipboard[iloop * DataPerRead]); #else - static_assert(is_same<Float, float>::value && DataPerRead == 4, + static_assert(is_same<Float, float>{} && DataPerRead == 4, "ds_write_b128 is only for float4"); ds_write_b128(reinterpret_cast(p_clipboard[iloop * DataPerRead]), diff --git a/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp index fb2eafa160..e328caf495 100644 --- a/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp @@ -239,7 +239,7 @@ struct Blockwise3dTensorCopy3 __device__ static constexpr index_t GetRegisterClipboardSize() { - static_assert(is_same<Float, float>::value, "wrong! only support float!\n"); + static_assert(is_same<Float, float>{}, "wrong!
only support float!\n"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp index 6f65ce077d..95fcd28023 100644 --- a/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp @@ -598,7 +598,7 @@ struct Blockwise4dTensorCopy3 __device__ constexpr index_t GetRegisterClipboardSize() const { - static_assert(is_same::value, "wrong! only support float!\n"); + static_assert(is_same{}, "wrong! only support float!\n"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp index 0b1c9e2a53..36d3060b23 100644 --- a/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp @@ -295,9 +295,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 #if CK_USE_AMD_INLINE_ASM template - __device__ void Run_asm(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread) const + __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block, + const FloatB* __restrict__ p_b_block, + FloatC* __restrict__ p_c_thread) const { constexpr auto a_block_mtx = BlockMatrixA{}; constexpr auto b_block_mtx = BlockMatrixB{}; @@ -330,19 +330,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; // assertion for inline asm - static_assert(is_same::value && is_same::value && - is_same::value, - "Run_asm only deal with float\n"); + static_assert(is_same{} && is_same{} && + is_same{}, + "Run_amd_asm only deal with float\n"); static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 && MPerThread == 8 && NPerThread == 8, - "Run_asm cannot deal with this GEMM shape yet\n"); + "Run_amd_asm cannot deal with this GEMM shape yet\n"); - static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n"); + static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n"); - static_assert( - BlockMatrixStrideA == 0 && BatchPerThread == 1, - "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n"); + static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1, + "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == " + "1 for now\n"); using Float4 = vector_type::MemoryType; @@ -421,19 +421,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; // assertion for inline asm - static_assert(is_same::value && is_same::value && - is_same::value, - "Run_asm only deal with float\n"); + static_assert(is_same{} && is_same{} && + is_same{}, + "Run_amd_asm only deal with float\n"); static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 && MPerThread == 8 && NPerThread == 8, - "Run_asm cannot deal with this GEMM shape yet\n"); + "Run_amd_asm cannot deal with this GEMM shape yet\n"); - static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n"); + static_assert(DataPerReadA == 4 && DataPerReadB == 
4, "Run_amd_asm only do float4 read\n"); - static_assert( - BlockMatrixStrideA == 0 && BatchPerThread == 1, - "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n"); + static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1, + "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == " + "1 for now\n"); using Float4 = vector_type::MemoryType; diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp index f257137f3c..b1e6171f4e 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp @@ -5,6 +5,10 @@ #include "ConstantMatrixDescriptor.hpp" #include "threadwise_gemm.hpp" +#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM +#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1 +#endif + namespace ck { // if following number are power of 2, index calculation shall be greatly reduced: @@ -51,7 +55,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0, "wrong! Cannot evenly divide work among\n"); - static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()), + static_assert(std::is_same{}, "wrong! ThreadMatrixC lengths is wrong"); auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); @@ -115,11 +120,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 } #if CK_USE_AMD_INLINE_ASM - // TODO: this is not working correctly template - __device__ void Run_asm(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread) const + __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block, + const FloatB* __restrict__ p_b_block, + FloatC* __restrict__ p_c_thread) const { constexpr auto True = integral_constant{}; constexpr auto False = integral_constant{}; @@ -156,15 +160,15 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; // assertion for inline asm - static_assert(is_same::value && is_same::value && - is_same::value, - "Run_asm only deal with float\n"); + static_assert(is_same{} && is_same{} && + is_same{}, + "Run_amd_asm only deal with float"); static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 && MPerThread == 8 && NPerThread == 8, - "Run_asm cannot deal with this GEMM shape yet\n"); + "Run_amd_asm cannot deal with this GEMM shape yet"); - static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n"); + static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read"); using Float4 = vector_type::MemoryType; @@ -200,9 +204,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 #endif template - __device__ void Run(const FloatA* const __restrict__ p_a_block, - const FloatB* const __restrict__ p_b_block, - FloatC* const __restrict__ p_c_thread) const + __device__ void Run_source(const FloatA* const __restrict__ p_a_block, + const FloatB* const __restrict__ p_b_block, + FloatC* const __restrict__ p_c_thread) const { constexpr auto True = integral_constant{}; constexpr auto False = integral_constant{}; @@ -291,9 +295,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 } template - __device__ void Run_RegisterDoubleBuffer(FloatA* const p_a_block, - FloatB* const p_b_block, - FloatC* p_c_thread) const + 
__device__ void RunRegisterDoubleBuffer_source(FloatA* const p_a_block, + FloatB* const p_b_block, + FloatC* p_c_thread) const { constexpr auto True = integral_constant<bool, true>{}; constexpr auto False = integral_constant<bool, false>{}; @@ -427,6 +431,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 p_c_thread); } } + template <class FloatA, class FloatB, class FloatC> + __device__ void Run(const FloatA* __restrict__ p_a_block, + const FloatB* __restrict__ p_b_block, + FloatC* __restrict__ p_c_thread) const + + { +#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM + Run_amd_asm(p_a_block, p_b_block, p_c_thread); +#else + Run_source(p_a_block, p_b_block, p_c_thread); +#endif + } }; } // namespace ck diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 1496630543..b59d4120ca 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -6,6 +6,10 @@ #include "ConstantMergedTensorDescriptor.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" +#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 +#endif + namespace ck { // slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor @@ -91,7 +95,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims; - // for now, only support SubLengths.Get() == 1 on a merged dimension that constains + // for now, only support SubLengths == 1 on a merged dimension that contains // multiple original dimensions static_for<0, nDim, 1>{}([&](auto IDim_) { constexpr auto IDim = decltype(IDim_){}; @@ -121,7 +125,7 @@ // partial offset on each dimension static_for<0, nDim, 1>{}([&](auto IDim_) { constexpr auto IDim = decltype(IDim_){}; - constexpr index_t idim = IDim.Get(); + constexpr index_t idim = IDim; constexpr auto src_partial_original_dims = SrcDesc::GetContainedOriginalDimensions(IDim); @@ -135,7 +139,7 @@ static_for<0, nDim, 1>{}([&](auto IDim_) { constexpr auto IDim = decltype(IDim_){}; - constexpr index_t idim = IDim.Get(); + constexpr index_t idim = IDim; constexpr auto dst_partial_original_dims = DstDesc::GetContainedOriginalDimensions(IDim); @@ -153,38 +157,6 @@ struct BlockwiseGenericTensorSliceCopy_v1 mThreadDstOffset = accumulate_on_array( mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0)); - -#if 0 - if(get_block_1d_id() == 0) - { - printf("id %5u %5u: " - "src_block_data_multi_id_begin: %u %u %u %u, " - "thread_cluster_multi_id: %u %u %u %u, " - "data_cluster_multi_id: %u %u %u %u, " - "thread_data_multi_id_begin: %u %u %u %u, " - "mThreadSrcOffset %u, mThreadDstOffset %u \n", - get_block_1d_id(), - get_thread_local_1d_id(), - src_block_data_multi_id_begin[0], - src_block_data_multi_id_begin[1], - src_block_data_multi_id_begin[2], - src_block_data_multi_id_begin[3], - thread_cluster_multi_id[0], - thread_cluster_multi_id[1], - thread_cluster_multi_id[2], - thread_cluster_multi_id[3], - data_cluster_multi_id[0], - data_cluster_multi_id[1], - data_cluster_multi_id[2], - data_cluster_multi_id[3], - thread_data_multi_id_begin[0], - thread_data_multi_id_begin[1], - thread_data_multi_id_begin[2], - thread_data_multi_id_begin[3],
mThreadSrcOffset, - mThreadDstOffset); - } -#endif } __device__ static constexpr index_t GetRegisterClipboardSize() @@ -210,19 +182,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { -#if 0 - constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); - - const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - - const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; - - const index_t src_offset = - SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); - - const index_t clipboard_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); -#else // HIP compiler performs better with these codes +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto src_thread_data_multi_id_begin = @@ -236,6 +196,18 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); +#else + constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); + + const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; + + const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + + const index_t src_offset = + SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); + + const index_t clipboard_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); #endif threadwise_generic_tensor_slice_copy_v1(SrcDesc{}, @@ -263,18 +235,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { -#if 0 - constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); - - const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; - - const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - - const index_t clipboard_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); - - const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); -#else // HIP compiler performs better with these codes +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto clipboard_data_multi_id_begin = @@ -287,6 +248,17 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); +#else + constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); + + const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + + const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; + + const index_t clipboard_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); + + const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); #endif threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc, @@ -310,7 +282,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 } // When moving the slicing windows along a merged dimension, if the strides of the - // contained (by the merged dimension) original 
dimensions are in descending order, + // contained (by the merged dimension) original dimensions are not in descending order, // then there is no guarantee that the new offset will be larger than the old offset // for movement in positive direction (vice versue for movement in negative direction). // As a result, there is the possiblity that the offset calculation may result in @@ -323,7 +295,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 Number, Number, integral_constant direction) { constexpr auto IDim = Number{}; - constexpr index_t idim = IDim.Get(); + constexpr index_t idim = IDim; static_if{}([&](auto fwd) { // logic for a merged dimension, also works for non-merged dimension, but its logic may @@ -350,8 +322,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto I = decltype(I_){}; constexpr index_t idim_original = src_partial_original_dims.Get(I); - mThreadSrcOriginalMultiId(idim_original) = - new_src_partial_original_multi_id[I.Get()]; + mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I]; }); // calculate new partial offset on this merged dimension diff --git a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp index 2de6cf1d45..55e7fb8f69 100644 --- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp @@ -49,7 +49,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 make_ConstantTensorDescriptor_packed(thread_cluster_lengths); // sanity check: data type - static_assert(is_same::value, "wrong! only support float for now!\n"); + static_assert(is_same{}, "wrong! only support float for now!\n"); // sanity check: nDim static_assert(SrcDesc::GetNumOfDimension() == nDim && @@ -121,12 +121,11 @@ struct BlockwiseTensorSliceReorderCopy_v3 reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster); static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr auto I = decltype(IDim){}; - constexpr index_t i = I.Get(); + constexpr index_t idim = IDim; // compiler: will it really compute index here, or be merged with // GetOffsetFromMultiIndex and // optimized away??? - src_data_multi_id(i) *= src_sub_lengths.Get(I); + src_data_multi_id(idim) *= src_sub_lengths.Get(IDim); }); // compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex diff --git a/composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp b/composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp index a8d0398de1..7bf340f162 100644 --- a/composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp @@ -26,16 +26,16 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi constexpr index_t nshift = NShift::mValue; constexpr index_t did0_end = - is_same::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0); + is_same{} ? desc.GetLength(I0) - nshift : desc.GetLength(I0); constexpr index_t did1_end = - is_same::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1); + is_same{} ? desc.GetLength(I1) - nshift : desc.GetLength(I1); constexpr index_t did2_end = - is_same::value ? desc.GetLength(I2) - nshift : desc.GetLength(I2); + is_same{} ? desc.GetLength(I2) - nshift : desc.GetLength(I2); constexpr index_t did3_end = - is_same::value ? desc.GetLength(I3) - nshift : desc.GetLength(I3); + is_same{} ? 
desc.GetLength(I3) - nshift : desc.GetLength(I3); for(index_t did0 = 0; did0 < did0_end; ++did0) { diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp index ea77027c10..d77ad2bbee 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp @@ -71,24 +71,7 @@ __device__ void threadwise_gemm(MatrixA, integral_constant, FloatC* __restrict__ p_c_thread) { -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - printf("p_a_thread: %f %f %f %f\n", - p_a_thread[0], - p_a_thread[1], - p_a_thread[2], - p_a_thread[3]); - printf("p_b_thread: %f %f %f %f\n", - p_b_thread[0], - p_b_thread[1], - p_b_thread[2], - p_b_thread[3]); - } -#endif - - if(TransA && (!TransB) && (!TransC)) - { + static_if{}([&](auto fwd) { constexpr auto a_mtx = MatrixA{}; constexpr auto b_mtx = MatrixB{}; constexpr auto c_mtx = MatrixC{}; @@ -111,12 +94,10 @@ __device__ void threadwise_gemm(MatrixA, } } } - } - else - { + }).Else([&](auto fwd) { // not implemented - assert(false); - } + static_assert(fwd(false), "wrong! support for this config is not implemented"); + }); } } // namespace ck diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 2dd7b79ab2..16e1e7a153 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -5,6 +5,10 @@ #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" +#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 +#endif + namespace ck { template ::value, "wrong! map is not valid"); -#if 0 - // doesn't compile, because merged-tensor reordering is not implemented - // TODO: implement tensor desc ops for merged-tensor - constexpr auto src_strides_in_access_order = - SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); + // TODO: do more sanity-check here, something like: + // constexpr auto src_strides_in_access_order = + // SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); - constexpr auto dst_strides_in_access_order = - SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); + // constexpr auto dst_strides_in_access_order = + // SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); - // check src/dst stride on the lowest access dimension - static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) && - (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1), - "wrong! src/dst stride on the lowest access dimension needs to be 1 for " - "vectorized read/write"); -#endif + // // check src/dst stride on the lowest access dimension + // static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) && + // (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1), + // "wrong! 
src/dst stride on the lowest access dimension needs to be 1 for " + // "vectorized read/write"); constexpr auto slice_lengths_in_access_order = SliceLengths::ReorderGivenNew2Old(DimAccessOrder{}); @@ -64,24 +65,7 @@ __device__ void threadwise_generic_tensor_slice_copy_v1( using vector_t = typename vector_type::MemoryType; -#if 1 - ford{}([&](auto access_multi_id) { - auto data_multi_id_in_access_order = access_multi_id; - data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess; - - const auto data_multi_id = - reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); - - const index_t src_index = - SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); - - const index_t dst_index = - DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); - - *reinterpret_cast(&p_dst[dst_index]) = - *reinterpret_cast(&p_src[src_index]); - }); -#else +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 static_ford{}([&](auto access_multi_id) { constexpr index_t itmp = access_multi_id.Back() * DataPerAccess; @@ -97,6 +81,23 @@ __device__ void threadwise_generic_tensor_slice_copy_v1( const index_t dst_index = DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); + *reinterpret_cast(&p_dst[dst_index]) = + *reinterpret_cast(&p_src[src_index]); + }); +#else + ford{}([&](auto access_multi_id) { + auto data_multi_id_in_access_order = access_multi_id; + data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess; + + const auto data_multi_id = + reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); + + const index_t src_index = + SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); + + const index_t dst_index = + DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); + *reinterpret_cast(&p_dst[dst_index]) = *reinterpret_cast(&p_src[src_index]); }); diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_copy.hpp index 8d0ce26f94..ac0b500b15 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_copy.hpp @@ -56,7 +56,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc, static_ford{}([=](auto Ids) { static_for<0, nRead, 1>{}([&](auto IRead) { - constexpr auto multi_id = decltype(Ids){}.PushBack(Number{}); + constexpr auto multi_id = decltype(Ids){}.PushBack(Number{}); const index_t src_index = src_desc.GetOffsetFromMultiIndex(multi_id); @@ -177,8 +177,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc, // pack data static_for<0, DstDataPerWrite, 1>{}([&](auto IDstData) { - const auto dst_multi_id = - ids.PushBack(IWrite.Get() * DstDataPerWrite + IDstData.Get()); + const auto dst_multi_id = ids.PushBack(IWrite * DstDataPerWrite + IDstData); const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{}); @@ -189,7 +188,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc, }); // write data - const auto dst_multi_id = ids.PushBack(IWrite.Get() * DstDataPerWrite); + const auto dst_multi_id = ids.PushBack(IWrite * DstDataPerWrite); const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id); diff --git a/composable_kernel/include/utility/Array.hpp b/composable_kernel/include/utility/Array.hpp index fcf87c5843..f33fa516e2 100644 --- 
a/composable_kernel/include/utility/Array.hpp +++ b/composable_kernel/include/utility/Array.hpp @@ -98,7 +98,7 @@ __host__ __device__ constexpr auto reorder_array_given_new2old(const Array>::value, "wrong! invalid reorder map"); + static_assert(is_valid_sequence_map>{}, "wrong! invalid reorder map"); return Array{old_array[IRs]...}; } diff --git a/composable_kernel/include/utility/Sequence.hpp b/composable_kernel/include/utility/Sequence.hpp index 09d67c13fa..5c566503a6 100644 --- a/composable_kernel/include/utility/Sequence.hpp +++ b/composable_kernel/include/utility/Sequence.hpp @@ -55,22 +55,6 @@ struct Sequence return Sequence{})...>{}; } -#if 0 // require sequence_sort, which is not implemented yet - template - __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New /*old2new*/) - { - static_assert(sizeof...(Is) == MapOld2New::GetSize(), - "wrong! reorder map should have the same size as Sequence to be rerodered"); - - static_assert(is_valid_sequence_map::value, - "wrong! invalid reorder map"); - - constexpr auto map_new2old = typename sequence_map_inverse::SeqMapType{}; - - return ReorderGivenNew2Old(map_new2old); - } -#endif - __host__ __device__ static constexpr auto Reverse(); __host__ __device__ static constexpr index_t Front() @@ -263,74 +247,15 @@ struct sequence_reverse> using SeqType = Sequence; }; -#if 0 // not fully implemented -template -struct sequence_sort_merge_impl; - -template -struct sequence_sort_merge_impl, - Sequence, - Sequence, - Sequence> -{ -}; - -template -struct sequence_sort; - -template -struct sequence_sort> -{ - using OriginalSeqType = Sequence; - using SortedSeqType = xxxxx; - using MapSorted2OriginalType = xxx; -}; - -template -struct sequence_map_inverse_impl; - -// impl for valid map, no impl for invalid map -template -struct sequence_map_inverse_impl, true> -{ - using SeqMapType = sequence_sort>::MapSorted2OriginalType; -}; - -template -struct sequence_map_inverse; - -template -struct sequence_map_inverse> -{ - // TODO: make sure the map to be inversed is valid: [0, sizeof...(Is)) - static constexpr bool is_valid_sequence_map = - is_same>::SortedSeqType, - typename arithmetic_sequence_gen<0, sizeof...(Is), 1>::SeqType>::value; - - // make compiler fails, if is_valid_map != true - using SeqMapType = - typename sequence_map_inverse_impl, is_valid_map>::SeqMapType; -}; - -#endif - template struct is_valid_sequence_map { - static constexpr bool value = -#if 0 // sequence_sort is not implemented yet - is_same::SeqType, - typename sequence_sort::SortedSeqType>::value; -#else - true; -#endif + static constexpr bool value = true; + + // TODO: add proper check for is_valid, something like: + // static constexpr bool value = + // is_same::SeqType, + // typename sequence_sort::SortedSeqType>{}; }; template diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index e82cd62db8..6400fb708f 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -3,91 +3,8 @@ #include "vector_type.hpp" -#define NO_VM_WAIT 0 -#define NO_LGKM_WAIT 0 -#define NO_DS_READ 0 -#define NO_DS_WRITE 0 -#define NO_GLB_READ 0 - namespace ck { -// cast a pointer of LDS to its address -extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; - -__device__ void vmcnt(index_t cnt) -{ -#if !NO_VM_WAIT - if(cnt == 0) - { - asm volatile("\n \ - s_waitcnt vmcnt(0) \n \ - " ::); - } - else if(cnt == 1) - { - asm volatile("\n \ 
- s_waitcnt vmcnt(1) \n \ - " ::); - } - else if(cnt == 2) - { - asm volatile("\n \ - s_waitcnt vmcnt(2) \n \ - " ::); - } - else if(cnt == 4) - { - asm volatile("\n \ - s_waitcnt vmcnt(2) \n \ - " ::); - } - else - { - assert(false); - } -#endif -} - -__device__ void lgkmcnt(index_t cnt) -{ -#if !NO_LGKM_WAIT - if(cnt == 0) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(0) \n \ - " ::); - } - else if(cnt == 1) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(1) \n \ - " ::); - } - else if(cnt == 2) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(2) \n \ - " ::); - } - else if(cnt == 3) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(3) \n \ - " ::); - } - else if(cnt == 4) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(4) \n \ - " ::); - } - else - { - assert(false); - } -#endif -} - __device__ void outerProduct1x4(const float* a, const float* b, float* c) { asm volatile("\n \ @@ -112,21 +29,7 @@ __device__ void outerProduct1x4(const float& a, const vector_type::MemoryType& b, vector_type::MemoryType& c) { -#if 0 - asm volatile( - "\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : - :"v"(c.x),"v"(c.y),"v"(c.z),"v"(c.w), \ - "v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w) - ); -#else outerProduct1x4(&a, (float*)&b, (float*)&c); -#endif } __device__ void outerProduct4x4(const vector_type::MemoryType& a, @@ -136,57 +39,10 @@ __device__ void outerProduct4x4(const vector_type::MemoryType& a, vector_type::MemoryType& c2, vector_type::MemoryType& c3) { -#if 0 - asm volatile( - "\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : - :"v"(c0.x),"v"(c0.y),"v"(c0.z),"v"(c0.w), \ - "v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w) - ); - asm volatile( - "\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : - :"v"(c1.x),"v"(c1.y),"v"(c1.z),"v"(c1.w), \ - "v"(a.y),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w) - ); - asm volatile( - "\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : - :"v"(c2.x),"v"(c2.y),"v"(c2.z),"v"(c2.w), \ - "v"(a.z),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w) - ); - asm volatile( - "\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : - :"v"(c3.x),"v"(c3.y),"v"(c3.z),"v"(c3.w), \ - "v"(a.w),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w) - ); -#else outerProduct1x4(a.x, b, c0); outerProduct1x4(a.y, b, c1); outerProduct1x4(a.z, b, c2); outerProduct1x4(a.w, b, c3); -#endif } __device__ void outerProduct8x8(const vector_type::MemoryType* a, @@ -201,7 +57,6 @@ __device__ void outerProduct8x8(const vector_type::MemoryType* a, __device__ void ds_read_b128(vector_type::MemoryType& r, void* lds, index_t offset = 0) { -#if !NO_DS_READ if(offset == 0) { asm volatile("\n \ @@ -722,33 +577,11 @@ __device__ void ds_read_b128(vector_type::MemoryType& r, void* lds, in : "=v"(r) : "v"(__to_local(lds))); } -#endif -} - -__device__ void global_load(vector_type::MemoryType& r, - const vector_type::MemoryType* ptr, - index_t offset = 0) -{ -#if !NO_GLB_READ - if(offset == 0) - { - asm volatile("\n \ - global_load_dwordx4 %0, %1, off \n \ - " - : "=v"(r) - : "v"(ptr)); - } - else - { - assert(false); - } -#endif } __device__ void ds_write_b128(const vector_type::MemoryType& r, void* lds, index_t offset = 0) { -#if !NO_DS_WRITE if(offset == 0) { asm volatile("\n \ @@ -761,7 +594,6 @@ 
ds_write_b128(const vector_type::MemoryType& r, void* lds, index_t off { assert(false); } -#endif } } // namespace ck diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 4d840df7fe..babdbc47cf 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -7,6 +7,9 @@ #include "hip/hip_fp16.h" #define CK_USE_AMD_INLINE_ASM 1 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 + namespace ck { // For some reason, HIP compiler need this definition to generate optimal load and store diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 0f9e3441ea..39afc0b154 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -9,6 +9,9 @@ #include "helper_cuda.h" #define CK_USE_AMD_INLINE_ASM 0 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 + namespace ck { // For some reason, CUDA need this definition, otherwise diff --git a/composable_kernel/include/utility/functional.hpp b/composable_kernel/include/utility/functional.hpp index 920f11af2a..56bdc4c06b 100644 --- a/composable_kernel/include/utility/functional.hpp +++ b/composable_kernel/include/utility/functional.hpp @@ -24,10 +24,8 @@ struct swallow }; // Emulate if constexpr -template -struct static_if -{ -}; +template +struct static_if; template <> struct static_if diff --git a/composable_kernel/include/utility/integral_constant.hpp b/composable_kernel/include/utility/integral_constant.hpp index 005d08e0c9..6038b1b203 100644 --- a/composable_kernel/include/utility/integral_constant.hpp +++ b/composable_kernel/include/utility/integral_constant.hpp @@ -1,15 +1,12 @@ #ifndef CK_INTEGRAL_CONSTANT_HPP #define CK_INTEGRAL_CONSTANT_HPP +#include + namespace ck { -template -struct integral_constant -{ - static const T value = N; - - __host__ __device__ constexpr T Get() const { return value; } -}; +template +using integral_constant = std::integral_constant; template __host__ __device__ constexpr auto operator+(integral_constant, integral_constant) @@ -17,6 +14,12 @@ __host__ __device__ constexpr auto operator+(integral_constant, integral_c return integral_constant{}; } +template +__host__ __device__ constexpr auto operator*(integral_constant, integral_constant) +{ + return integral_constant{}; +} + template using Number = integral_constant; diff --git a/composable_kernel/include/utility/utility.hpp b/composable_kernel/include/utility/utility.hpp index 9d32c2be55..c744e27ddd 100644 --- a/composable_kernel/include/utility/utility.hpp +++ b/composable_kernel/include/utility/utility.hpp @@ -1,6 +1,7 @@ #ifndef CK_UTILITY_HPP #define CK_UTILITY_HPP +#include #include "config.hpp" namespace ck { @@ -9,23 +10,8 @@ __device__ index_t get_thread_local_1d_id() { return threadIdx.x; } __device__ index_t get_block_1d_id() { return blockIdx.x; } -template -struct is_same -{ - static constexpr bool value = false; -}; - -template -struct is_same -{ - static constexpr bool value = true; -}; - template -__host__ __device__ constexpr bool is_same_type(X, Y) -{ - return is_same::value; -} +using is_same = std::is_same; namespace math { @@ -58,7 +44,7 
@@ struct integer_divide_ceiler { __host__ __device__ constexpr T operator()(T a, T b) const { - static_assert(is_same::value || is_same::value, "wrong type"); + static_assert(is_same{} || is_same{}, "wrong type"); return (a + b - 1) / b; } @@ -67,7 +53,7 @@ struct integer_divide_ceiler template __host__ __device__ constexpr T integer_divide_ceil(T a, T b) { - static_assert(is_same::value || is_same::value, "wrong type"); + static_assert(is_same{} || is_same{}, "wrong type"); return (a + b - 1) / b; } @@ -85,7 +71,7 @@ __host__ __device__ constexpr T max(T x, Ts... xs) auto y = max(xs...); - static_assert(is_same::value, "not the same type"); + static_assert(is_same{}, "not the same type"); return x > y ? x : y; } @@ -103,12 +89,12 @@ __host__ __device__ constexpr T min(T x, Ts... xs) auto y = min(xs...); - static_assert(is_same::value, "not the same type"); + static_assert(is_same{}, "not the same type"); return x < y ? x : y; } -// this is wrong +// this is WRONG // TODO: implement least common multiple properly, instead of calling max() template __host__ __device__ constexpr T lcm(T x, Ts... xs) diff --git a/composable_kernel/include/utility/vector_type.hpp b/composable_kernel/include/utility/vector_type.hpp index 72c73068e1..2b33887ffb 100644 --- a/composable_kernel/include/utility/vector_type.hpp +++ b/composable_kernel/include/utility/vector_type.hpp @@ -64,131 +64,6 @@ struct vector_type } }; -#if 0 -template <> -struct vector_type -{ - using MemoryType = half; - - __host__ __device__ static MemoryType Pack(half s) { return s; } -}; - -template <> -struct vector_type -{ - using MemoryType = half2; - - __host__ __device__ static MemoryType Pack(half s0, half s1) - { - union - { - MemoryType vector; - half scalar[2]; - } data; - - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = float2; -}; - -template <> -struct vector_type -{ - using MemoryType = float4; -}; - -template <> -struct vector_type -{ - using MemoryType = char; - - __host__ __device__ static MemoryType Pack(char s) { return s; } -}; - -template <> -struct vector_type -{ - using MemoryType = int16_t; - - __host__ __device__ static MemoryType Pack(char s0, char s1) - { - union - { - MemoryType vector; - char scalar[2]; - } data; - - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = int32_t; - - __host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3) - { - union - { - MemoryType vector; - char scalar[4]; - } data; - - data.scalar[0] = s0; - data.scalar[1] = s1; - data.scalar[2] = s2; - data.scalar[3] = s3; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = int64_t; -}; - -template <> -struct vector_type -{ - using MemoryType = int64_t; -}; - -template <> -struct vector_type -{ - using MemoryType = char4; -}; - -template <> -struct vector_type -{ - using MemoryType = int64_t; -}; - -template <> -struct vector_type -{ - using MemoryType = int; -}; - -template <> -struct vector_type -{ - using MemoryType = int64_t; -}; -#endif - } // namespace ck #endif diff --git a/driver/include/tensor.hpp b/driver/include/tensor.hpp index ff537e44fe..e7001b1022 100644 --- a/driver/include/tensor.hpp +++ b/driver/include/tensor.hpp @@ -46,7 +46,7 @@ auto call_f_unpack_args_impl(F f, T args, std::index_sequence) template auto call_f_unpack_args(F f, T args) { - constexpr std::size_t N = std::tuple_size::value; + 
constexpr std::size_t N = std::tuple_size<T>{}; return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{}); } @@ -60,7 +60,7 @@ auto construct_f_unpack_args(F, T args) { - constexpr std::size_t N = std::tuple_size<T>::value; + constexpr std::size_t N = std::tuple_size<T>{}; return construct_f_unpack_args_impl(args, std::make_index_sequence<N>{}); }