From f6cb5b846d1eff1d1e35ab58273becfd40bd0831 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 6 Apr 2019 15:10:40 -0500 Subject: [PATCH] debugging --- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 4 ++-- src/include/blockwise_2d_tensor_op.hip.hpp | 2 +- src/include/blockwise_gemm.hip.hpp | 1 + src/include/common.hip.hpp | 4 ++++ src/include/data_type.hip.hpp | 8 ++++++++ ...on_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp | 19 ++----------------- src/include/threadwise_gemm.hip.hpp | 2 -- 7 files changed, 18 insertions(+), 22 deletions(-) diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index e98cc11350..438408ca7e 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, constexpr index_t WeiBlockCopyDataPerRead = 4; constexpr index_t BlockSize = 256; -#elif 1 +#elif 0 // 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer constexpr index_t BPerBlock = 64; constexpr index_t KPerBlock = 128; @@ -266,7 +266,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, for(index_t i = 0; i < nrepeat; ++i) { constexpr auto gridwise_conv = -#if 0 +#if 1 GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #else GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 2bbf0edd30..3e692226c1 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -1,6 +1,6 @@ #pragma once +#include "common.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp" -#include "inline_asm.hpp" template __device__ void diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index e9a8817e0b..7f25a7322f 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -1,4 +1,5 @@ #pragma once +#include "common.hip.hpp" #include "threadwise_gemm.hip.hpp" // if following number are power of 2, index calculation shall be greatly reduced: diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index 6b8c450021..1c41d04feb 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -5,6 +5,10 @@ #include "Array.hip.hpp" #include "functional.hip.hpp" +#if DEVICE_BACKEDN_HIP +#include "inline_asm.hpp" +#endif + __device__ index_t get_thread_local_1d_id() { return threadIdx.x; } __device__ index_t get_block_1d_id() { return blockIdx.x; } diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index 20d9d91fa5..54bed9ec5a 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -15,7 +15,11 @@ struct vector_type template <> struct vector_type { +#if 1 typedef float MemoryType __attribute__((ext_vector_type(2))); +#else + using MemoryType = float2; +#endif __host__ __device__ static MemoryType Pack(float s0, float s1) { @@ -34,7 +38,11 @@ struct vector_type template <> struct vector_type { +#if 1 typedef float MemoryType __attribute__((ext_vector_type(4))); +#else + using MemoryType = float4; +#endif }; #if 0 diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index 48b8298cd3..acd4df9662 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -222,21 +222,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block); -#elif 1 - Float4 tmp_in, tmp_wei; - Float4* glb_in_p = - (Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset); - Float4* loc_in_p = (Float4*)(p_in_block + blockwise_in_copy.mDstMyThreadOffset); - - Float4* glb_wei_p = - (Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset); - Float4* loc_wei_p = (Float4*)(p_wei_block + blockwise_wei_copy.mDstMyThreadOffset); - - global_load(tmp_in, glb_in_p); - global_load(tmp_wei, glb_wei_p); - vmcnt(0); - ds_write_b128(tmp_in, loc_in_p); - ds_write_b128(tmp_wei, loc_wei_p); #endif __syncthreads(); @@ -247,11 +232,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn { for(index_t x = 0; x < X; ++x) { -#if 0 +#if 1 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer -#elif 1 +#elif 0 blockwise_gemm.Run_asm #endif (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), diff --git a/src/include/threadwise_gemm.hip.hpp b/src/include/threadwise_gemm.hip.hpp index 79fc1bf699..590b4ba1cb 100644 --- a/src/include/threadwise_gemm.hip.hpp +++ b/src/include/threadwise_gemm.hip.hpp @@ -1,7 +1,5 @@ #pragma once -#include "inline_asm.hpp" - template __device__ void threadwise_matrix_copy(SrcMatrix, const Float* __restrict__ p_src,