diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index fba11478f8..0f9976f453 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -487,6 +487,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
 #if 0
         mThreadwiseLoad.Run(p_src, p_buffer);
 #else
+        // hardcoded: global to register
         mThreadwiseLoad.template Run_amd_experiment(p_src, p_buffer);
 #endif
     }
@@ -497,7 +498,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
 #if 0
         mThreadwiseStore.Run(p_buffer, p_dst);
 #else
-        mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst);
+        // hardcoded: register to LDS
+        mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst);
 #endif
     }
 
@@ -506,13 +508,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
     {
         TData p_buffer[GetRegisterBufferSize()];
 
-#if 0
-        mThreadwiseLoad.Run(p_src, p_buffer);
-        mThreadwiseStore.Run(p_buffer, p_dst);
-#else
-        mThreadwiseLoad.template Run_amd_experiment(p_src, p_buffer);
-        mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst);
-#endif
+        RunLoadRegisterBuffer(p_src, p_buffer);
+        RunStoreRegisterBuffer(p_buffer, p_dst);
     }
 
     template
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 097ac78a8a..e661264d86 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -819,38 +819,38 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
 
             src_vector_t vector_data;
 
+            // Read vector from src.
+            // 1. The source code version can take src of all kinds of memory-space
+            // 2. The inline asm versions using global_load or buffer_load can only take
+            //    src from global-memory
+            //
+            // Comment for loading from global-memory:
+            // When
+            //   1) using source code, in order for compiler to emit optimal
+            //      load instruction, or
+            //   2) using inline asm (global_load or buffer_load), in order
+            //      for inline asm to be valid,
+            // the following assumptions need to be satisfied:
+            //   1. p_src needs to be block-invariant (assumption)
+            //   2. src_normal_offset must be calculated at compile time (guaranteed)
+            //   3. src_merged_offset can be a runtime value (no assumption imposed)
             static_if{}([&](auto) {
 #if 1 // source code
-                // Load vector from src.
-                // src can be all kinds of memory-space.
-                // In order for optimized global_load to be emitted by compiler, need to
-                // assume:
-                //   1. p_src need to be block-invariant (assumption)
-                //   2. src_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. src_merged_offset can be runtime value (no assumption imposed)
                 vector_data = *reinterpret_cast<const src_vector_t*>(
                     &p_src[src_normal_offset + src_merged_offset]);
-#else // inline asm using buffer_load
-                // Load vector from src
-                // src's memory-space can only be global-memory (buffer_load inline-asm is
-                // used)
-                // In order for buffer_load to be valid, need to assume:
-                //   1. p_src need to be block-invariant (assumption)
-                //   2. src_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. src_merged_offset can be runtime value (no assumption imposed)
-                vector_data = buffer_load(
+#elif 1 // inline asm using global_load
+                vector_data = __global_load(
+                    p_src,
+                    static_cast<uint32_t>(src_merged_offset),
+                    static_cast<uint32_t>(src_normal_offset));
+#elif 1 // inline asm using buffer_load
+                vector_data = __buffer_load(
                     p_src,
                     static_cast<uint32_t>(src_merged_offset),
                     static_cast<uint32_t>(src_normal_offset));
 #endif
             }).Else([&](auto) {
-                // Load vector from src.
                 // src can be all kinds of memory-space.
-                // In order for optimized global_load to be emitted by compiler, need to
-                // assume:
-                //   1. p_src need to be block-invariant (assumption)
-                //   2. src_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. src_merged_offset can be runtime value (no assumption imposed)
                 vector_data = *reinterpret_cast<const src_vector_t*>(
                     &p_src[src_normal_offset + src_merged_offset]);
             });
 
@@ -924,36 +924,34 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             const index_t dst_normal_offset =
                 DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
 
+            // Write vector into dst.
+            // 1. The source code version can take dst of all kinds of memory-space
+            // 2. The inline asm versions using global_store or buffer_store can only take
+            //    dst from global-memory
+            //
+            // Comment for storing into global-memory:
+            // When
+            //   1) using source code, in order for compiler to emit optimal
+            //      store instruction, or
+            //   2) using inline asm (global_store or buffer_store), in order
+            //      for inline asm to be valid,
+            // the following assumptions need to be satisfied:
+            //   1. p_dst needs to be block-invariant (assumption)
+            //   2. dst_normal_offset must be calculated at compile time (guaranteed)
+            //   3. dst_merged_offset can be a runtime value (no assumption imposed)
             static_if{}([&](auto) {
 #if 1 // source code
-                // Write vector into dst.
-                // dst can be all kinds of memory-space
-                // In order for optmized global_store to be emitted by compiler, need to
-                // assume:
-                //   1. p_dst need to be block-invariant (assumption)
-                //   2. dst_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. dst_merged_offset can be runtime value (no assumption imposed)
                 *reinterpret_cast<dst_vector_t*>(
                     &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
-#else // inline asm using buffer_store
-                // Write vector into dst.
-                // dst's memory-space need to be global-memory (buffer_store is used)
-                // In order for optmized global_store to be emitted by compiler, need to
-                // assume:
-                //   1. p_dst need to be block-invariant (assumption)
-                //   2. dst_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. dst_merged_offset can be runtime value (no assumption imposed)
-                buffer_store(
+#elif 1 // inline asm using global_store
+                __global_store(
+                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+#elif 1 // inline asm using buffer_store
+                __buffer_store(
                     vector_data, p_dst, dst_merged_offset, dst_normal_offset);
 #endif
             }).Else([&](auto) {
-                // Write vector into dst.
                 // dst can be all kinds of memory-space
-                // In order for optmized global_store to be emitted by compiler, need to
-                // assume:
-                //   1. p_dst need to be block-invariant (assumption)
-                //   2. dst_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. dst_merged_offset can be runtime value (no assumption imposed)
                 *reinterpret_cast<dst_vector_t*>(
                     &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
             });
diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp
index 307c96c4a4..04b6864fa4 100644
--- a/composable_kernel/include/utility/amd_inline_asm.hpp
+++ b/composable_kernel/include/utility/amd_inline_asm.hpp
@@ -8,21 +8,169 @@ namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
 
-// buffer_load and buffer_store
+// global_load and global_store
 template
 __device__ typename vector_type::MemoryType
-buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+__global_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
 
 template
-__device__ void buffer_store(const typename vector_type::MemoryType& src,
-                             T* p_dst_block,
-                             uint32_t dst_thread_offset,
-                             uint32_t dst_const_offset);
+__device__ void __global_store(const typename vector_type::MemoryType& src,
+                               T* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset);
 
 template <>
-__device__ float buffer_load(const float* p_src_block,
-                             uint32_t src_thread_offset,
-                             uint32_t src_const_offset)
+__device__ float __global_load(const float* p_src_block,
+                               uint32_t src_thread_offset,
+                               uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    float dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+    global_load_dword %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+
+    return dst;
+#else // compute on SALU
+    float dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    asm volatile("\n \
+    global_load_dword %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+
+    return dst;
+#endif
+}
+
+template <>
+__device__ vector_type<float, 2>::MemoryType __global_load(const float* p_src_block,
+                                                           uint32_t src_thread_offset,
+                                                           uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    vector_type<float, 2>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+    global_load_dwordx2 %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+
+    return dst;
+#else // compute on SALU
+    vector_type<float, 2>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    asm volatile("\n \
+    global_load_dwordx2 %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+
+    return dst;
+#endif
+}
+
+template <>
+__device__ vector_type<float, 4>::MemoryType __global_load(const float* p_src_block,
+                                                           uint32_t src_thread_offset,
+                                                           uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    vector_type<float, 4>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+    global_load_dwordx4 %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+
+    return dst;
+#else // compute on SALU
+    vector_type<float, 4>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    asm volatile("\n \
+    global_load_dwordx4 %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+
+    return dst;
+#endif
+}
+
+template <>
+__device__ void __global_store(const float& src,
+                               float* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset)
+{
+#if 0 // compute on VALU
+    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset + dst_const_offset);
+
+    asm volatile("\n \
+    global_store_dword %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 :
+                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block));
+#else // compute on SALU
+    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset);
+
+    float* p_dst_block_with_offset = p_dst_block + dst_const_offset;
+
+    asm volatile("\n \
+    global_store_dword %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 :
+                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
+#endif
+}
+
+// __buffer_load and __buffer_store
+template
+__device__ typename vector_type::MemoryType
+__buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+
+template
+__device__ void __buffer_store(const typename vector_type::MemoryType& src,
+                               T* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset);
+
+template <>
+__device__ float __buffer_load(const float* p_src_block,
+                               uint32_t src_thread_offset,
+                               uint32_t src_const_offset)
 {
     float dst;
 
@@ -35,7 +183,7 @@ __device__ float buffer_load(const float* p_src_block,
     reinterpret_cast<int32_t*>(&src_block_setting)[3] = 0x00027000;
 
     asm volatile("\n \
-    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
@@ -45,9 +193,9 @@
 }
 
 template <>
-__device__ vector_type<float, 2>::MemoryType buffer_load(const float* p_src_block,
-                                                         uint32_t src_thread_offset,
-                                                         uint32_t src_const_offset)
+__device__ vector_type<float, 2>::MemoryType __buffer_load(const float* p_src_block,
+                                                           uint32_t src_thread_offset,
+                                                           uint32_t src_const_offset)
 {
     vector_type<float, 2>::MemoryType dst;
 
@@ -60,7 +208,7 @@ __device__ vector_type::MemoryType buffer_load(const float*
     reinterpret_cast<int32_t*>(&src_block_setting)[3] = 0x00027000;
 
     asm volatile("\n \
-    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
@@ -70,9 +218,9 @@ __device__ vector_type::MemoryType buffer_load(const float*
 }
 
 template <>
-__device__ vector_type<float, 4>::MemoryType buffer_load(const float* p_src_block,
-                                                         uint32_t src_thread_offset,
-                                                         uint32_t src_const_offset)
+__device__ vector_type<float, 4>::MemoryType __buffer_load(const float* p_src_block,
+                                                           uint32_t src_thread_offset,
+                                                           uint32_t src_const_offset)
 {
     vector_type<float, 4>::MemoryType dst;
 
@@ -85,7 +233,7 @@ __device__ vector_type::MemoryType buffer_load(const float*
     reinterpret_cast<int32_t*>(&src_block_setting)[3] = 0x00027000;
 
    asm volatile("\n \
-    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
@@ -95,10 +243,10 @@ __device__ vector_type::MemoryType buffer_load(const float*
 }
 
 template <>
-__device__ void buffer_store(const float& src,
-                             float* p_dst_block,
-                             uint32_t dst_thread_offset,
-                             uint32_t dst_const_offset)
+__device__ void __buffer_store(const float& src,
+                               float* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset)
 {
     int32x4_t dst_block_setting{0};
     // fill in byte 0 - 1
@@ -109,7 +257,7 @@ __device__ void buffer_store(const float& src,
     reinterpret_cast<int32_t*>(&dst_block_setting)[3] = 0x00027000;
 
     asm volatile("\n \
-    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
+    __buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  :
diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
index 4555ce873f..febd1625e2 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
-#if 1
+#if 0
     // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
     constexpr index_t BlockSize = 256;
 
@@ -82,7 +82,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
 
-    constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #elif 0
     // BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data
@@ -156,7 +156,7 @@
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
 
-    constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 2;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 
 #endif
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index 93a871b83b..166f9382fe 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
 
-#elif 0
+#elif 1
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
     constexpr index_t N = 64;
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
 
-#elif 1
+#elif 0
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;