diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index 965f84b612..e550581fc4 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -51,8 +51,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 1 - // each thread hold 64 data +#if 0 + // BlockSize = 256, each thread hold 64 data constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; @@ -86,6 +86,43 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; +#elif 1 + // BlockSize = 64, each thread hold 64 data + constexpr index_t BlockSize = 64; + + constexpr index_t BPerBlock = 8; + constexpr index_t KPerBlock = 64; + constexpr index_t EPerBlock = 8; + + constexpr index_t GemmNRepeat = 2; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 2; + constexpr index_t GemmNLevel1Cluster = 2; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 2, 1, 4>; + using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 1, 8, 1>; + using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] + using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [E, B, N1, N2] + using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] + + constexpr index_t InBlockCopySrcDataPerRead_B = 1; + constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4; + + using WeiBlockCopySubLengths_E_K = Sequence<4, 2>; + using WeiBlockCopyClusterLengths_E_K = Sequence<2, 32>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #endif diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 8ae1b3eb35..94c59c478c 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -311,6 +311,51 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; +#elif 0 + // 5x5 filter, 2x2 pad, 7x7 input + constexpr index_t N = 128; + constexpr index_t C = 48; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 128; + constexpr index_t Y = 5; + constexpr index_t X = 5; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<2, 2>; + using RightPads = Sequence<2, 2>; +#elif 0 + // 7x1 filter, 3x0 pad, 17x17 input + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 7; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<3, 0>; + using RightPads = Sequence<3, 0>; +#elif 1 + // 1x7 filter, 0x3 pad, 17x17 input + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 7; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 3>; + using RightPads = Sequence<0, 3>; #endif auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence{}); diff --git a/script/compile-hip.sh b/script/compile-hip.sh index 51d1bed797..5a61bc1387 100755 --- a/script/compile-hip.sh +++ b/script/compile-hip.sh @@ -1,8 +1,8 @@ #!/bin/bash export KMDUMPISA=1 export KMDUMPLLVM=1 - export KMOPTLLC="-mattr=+enable-ds128" -#export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr" +#export KMOPTLLC="-mattr=+enable-ds128" + export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr" make -j driver /opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm