From e55cfe1536e44cccfe814f29a6b1f37255d02b3b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 24 Jun 2019 11:49:13 -0500 Subject: [PATCH] debugging vector load for generic tensor copy --- ...lution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp | 40 +++++++++++++++++-- driver/src/driver.cpp | 3 ++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp index 94d8f203f1..c8f9d17648 100644 --- a/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp @@ -64,7 +64,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc, constexpr index_t BPerBlock = 16; constexpr index_t KPerBlock = 128; - constexpr index_t CPerBlock = 8; + constexpr index_t EPerBlock = 8; constexpr index_t GemmMPerThreadSubC = 4; constexpr index_t GemmNPerThreadSubC = 4; @@ -98,7 +98,41 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc, constexpr index_t BPerBlock = 16; constexpr index_t KPerBlock = 128; - constexpr index_t CPerBlock = 8; + constexpr index_t EPerBlock = 8; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 4, 1>; + using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 4, 4>; + using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] + using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] + using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] + + constexpr index_t InBlockCopySrcDataPerRead_B = 4; + constexpr index_t InBlockCopyDstDataPerWrite_N2 = 1; + + using WeiBlockCopySubLengths_E_K = Sequence<4, 1>; + using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; +#elif 1 + constexpr index_t BlockSize = 256; + + constexpr index_t BPerBlock = 16; + constexpr index_t KPerBlock = 128; + constexpr index_t EPerBlock = 8; constexpr index_t GemmMPerThreadSubC = 4; constexpr index_t GemmNPerThreadSubC = 4; @@ -152,7 +186,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc, ConvDilations, BPerBlock, KPerBlock, - CPerBlock, + EPerBlock, N1, N2, GemmMPerThreadSubC, diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 8826b62ab3..d2c5b607b8 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -566,6 +566,9 @@ int main(int argc, char* argv[]) constexpr index_t Y = 1; constexpr index_t X = 1; + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + constexpr index_t HPad = 0; constexpr index_t WPad = 0; #elif 0