From 740149fcf1708a4a023f4d951629cd32aa2c3f3e Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 13 Aug 2019 17:26:00 -0500 Subject: [PATCH] clean up --- ...tion_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp | 34 +++- ..._v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp | 59 +++---- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 6 +- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 6 +- ..._v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp | 6 +- ..._v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp | 6 +- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 6 +- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 6 +- .../ConstantMatrixDescriptor.hpp | 6 +- .../blockwise_batched_gemm.hpp | 145 +----------------- .../include/utility/config_amd.hpp.in | 2 - .../include/utility/config_nvidia.hpp.in | 2 - driver/src/driver.cpp | 2 +- 13 files changed, 71 insertions(+), 215 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp index 619faaf094..1e20a7534d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp @@ -126,7 +126,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn // blockwise copy // input: format is [C, Hi, Wi, N] auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>( + make_zero_array(), make_zero_array()) + .Run(p_out_thread, p_out_thread_on_global); +#elif 0 + ThreadwiseGenericTensorSliceCopy_v1r1::type, @@ -328,6 +339,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn OutThreadCopyDataPerAccess_N>( make_zero_array(), make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); +#endif }).Else([&](auto fwd) { static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, @@ -375,7 +387,18 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn wo_block_data_begin + wo_thread_data_begin, n_block_data_begin + n_thread_data_begin); - ThreadwiseGenericTensorSliceCopy_v2r1::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>( + make_zero_array(), make_zero_array()) + .Run(p_out_thread, p_out_thread_on_global); +#elif 0 + ThreadwiseGenericTensorSliceCopy_v1r1::type, @@ -386,6 +409,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn OutThreadCopyDataPerAccess_N>( make_zero_array(), make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); +#endif }); } }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp index 6cedb7f02a..4f297fac3d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -7,10 +7,6 @@ #include "blockwise_generic_tensor_slice_copy.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" #include "blockwise_batched_gemm.hpp" -#include "blockwise_2d_tensor_op.hpp" -#include "blockwise_4d_tensor_op.hpp" -#include "threadwise_tensor_slice_copy.hpp" -#include "threadwise_4d_tensor_op.hpp" namespace ck { @@ -133,18 +129,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( Sequence{}); -#if 1 // blockwise copy // input: format is [C, Hi, Wi, N] - const auto blockwise_in_copy = - Blockwise4dTensorCopy3{}; -#else auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1({0, 0, 0, 0}, {0, 0, 0, 0}); -#endif -#if 1 // blockwise wei copy // format is [CPerBlock, X * KPerBlock] - const auto blockwise_wei_copy = - Blockwise2dTensorCopy3({0, 0}, {0, 0}); -#else const auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v1({0, 0}, {0, 0}); -#endif // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -428,13 +403,16 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer n_block_data_begin + n_thread_data_begin); #if 1 - threadwise_tensor_slice_copy(out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_thread_on_global, - out_10d_thread_desc.GetLengths(), - Number{}); -#else + ThreadwiseGenericTensorSliceCopy_v1r2::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>( + make_zero_array(), make_zero_array()) + .Run(p_out_thread, p_out_thread_on_global); +#elif 0 ThreadwiseGenericTensorSliceCopy_v1r1{}); -#else + ThreadwiseGenericTensorSliceCopy_v1r2::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>( + make_zero_array(), make_zero_array()) + .Run(p_out_thread, p_out_thread_on_global); +#elif 0 ThreadwiseGenericTensorSliceCopy_v1r1{}; } -template -__host__ __device__ constexpr auto - make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor(TDesc) +template +__host__ __device__ constexpr auto make_ConstantMatrixDescriptor(ConstantTensorDescriptor) { + using TDesc = ConstantTensorDescriptor; static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); static_assert(TDesc::GetStrides()[1] == 1, "wrong"); return ConstantMatrixDescriptor{}, Number{}); + make_ConstantMatrixDescriptor_packed(Number{}, Number{}); constexpr auto b_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); + make_ConstantMatrixDescriptor_packed(Number{}, Number{}); // thread A-sub, B-sub for copy constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor( @@ -382,102 +345,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); } - - template - __device__ void Run_asm_v2(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread) const - { - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - constexpr index_t M = a_block_mtx.NCol(); - constexpr index_t N = b_block_mtx.NCol(); - constexpr index_t K = a_block_mtx.NRow(); // A is transposed - - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); - - // thread A, B for GEMM - // A is transposed, b is not - constexpr auto a_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - constexpr auto b_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - // thread A-sub, B-sub for copy - constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; - FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - - constexpr index_t MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster; - constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; - - // assertion for inline asm - static_assert(is_same{} && is_same{} && - is_same{}, - "Run_amd_asm only deal with float\n"); - - static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 && - MPerThread == 8 && NPerThread == 8, - "Run_amd_asm cannot deal with this GEMM shape yet\n"); - - static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n"); - - static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1, - "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == " - "1 for now\n"); - - using Float4 = vector_type::MemoryType; - - Float4* reg_a = (Float4*)(p_a_thread); - Float4* reg_b = (Float4*)(p_b_thread); - Float4* reg_c = (Float4*)(p_c_thread); - - void* a_lds_loc = (void*)(p_a_block + mMyThreadOffsetA); - void* b_lds_loc = (void*)(p_b_block + mMyThreadOffsetB); - - constexpr index_t a_lds_row_stride = sizeof(float) * a_block_mtx.RowStride(); - constexpr index_t b_lds_row_stride = sizeof(float) * b_block_mtx.RowStride(); - constexpr index_t a_lds_cluster_col_stride = sizeof(float) * MPerLevel1Cluster; - constexpr index_t b_lds_cluster_col_stride = sizeof(float) * NPerLevel1Cluster; - - ds_read_b128(reg_a[0], a_lds_loc, 0); - ds_read_b128(reg_b[0], b_lds_loc, 0); - ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride); - ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride); - lgkmcnt(2); - outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); - lgkmcnt(1); - outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); - -#pragma unroll - for(index_t k = 1; k < K; ++k) - { - ds_read_b128(reg_a[0], a_lds_loc, k * a_lds_row_stride); - lgkmcnt(1); - outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); - ds_read_b128(reg_b[0], b_lds_loc, k * b_lds_row_stride); - outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); - ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride + k * b_lds_row_stride); - ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride + k * a_lds_row_stride); - lgkmcnt(2); - outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); - lgkmcnt(1); - outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); - } - - lgkmcnt(0); - outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); - outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); - } #endif template diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index a99f68aba9..a7762a59b4 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -7,10 +7,8 @@ #define CK_DEVICE_BACKEND_AMD 1 #define CK_USE_AMD_INLINE_ASM 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 namespace ck { diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index e9842eda35..0a4b43d1a6 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -9,10 +9,8 @@ #define CK_DEVICE_BACKEND_NVIDIA 1 #define CK_USE_AMD_INLINE_ASM 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 namespace ck { diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 40d6c5b6fe..7ea05e243e 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -71,7 +71,7 @@ int main(int argc, char* argv[]) { using namespace ck; -#if 0 +#if 1 constexpr index_t N = 64; constexpr index_t C = 1536; constexpr index_t HI = 8;