mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 09:16:52 +00:00
clean up
This commit is contained in:
@@ -107,11 +107,11 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmDataPerReadB = 4;
|
||||
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<4, 4, 2, 4>;
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 4;
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 4;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
|
||||
#elif 0
|
||||
// for 3x3, 34x34, v1r2, Pascal, in-block-copy1
|
||||
constexpr index_t BlockSize = 128;
|
||||
@@ -137,12 +137,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmDataPerReadA = 4;
|
||||
constexpr index_t GemmDataPerReadB = 4;
|
||||
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<0, 0, 0, 0>; // not used
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 4;
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<0, 0, 0, 0>; // not used
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 4;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
|
||||
#elif 1
|
||||
// for 3x3, 34x34, v1r3, Pascal
|
||||
// for 3x3, 28x28, v1r3, Pascal
|
||||
@@ -170,12 +170,15 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmDataPerReadA = 4;
|
||||
constexpr index_t GemmDataPerReadB = 4;
|
||||
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 2, 4>;
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 4;
|
||||
using InBlockCopySubLengths_CHWN = Sequence<1, 1, 1, 4>;
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 2, 4>;
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 4;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
using WeiBlockCopySubLengths_CK = Sequence<2, 4>;
|
||||
using WeiBlockCopyClusterLengths_CK = Sequence<4, 32>;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
|
||||
#elif 0
|
||||
// for 3x3, 34x34, v1r3, Pascal, bad
|
||||
constexpr index_t BlockSize = 128;
|
||||
@@ -201,12 +204,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmDataPerReadA = 4;
|
||||
constexpr index_t GemmDataPerReadB = 4;
|
||||
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<2, 2, 32, 1>;
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 1;
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<2, 2, 32, 1>;
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 1;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 2;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 2;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 1;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 1;
|
||||
#elif 0
|
||||
// for 3x3, 34x34, v1r1, Vega 20
|
||||
constexpr index_t BlockSize = 256;
|
||||
@@ -232,12 +235,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmDataPerReadA = 4;
|
||||
constexpr index_t GemmDataPerReadB = 4;
|
||||
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<4, 4, 2, 8>;
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 2;
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<4, 4, 2, 8>;
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 2;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 2;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 2;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 4;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 4;
|
||||
#elif 1
|
||||
// for 3x3, 34x34, v1r3, Vega 20
|
||||
constexpr index_t BlockSize = 256;
|
||||
@@ -263,12 +266,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmDataPerReadA = 4;
|
||||
constexpr index_t GemmDataPerReadB = 4;
|
||||
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 4, 4>;
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 4;
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 4, 4>;
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 4;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 4;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 4;
|
||||
#elif 0
|
||||
// for 3x3, 56x56, v1r1, Pascal
|
||||
constexpr index_t NPerBlock = 32;
|
||||
@@ -282,13 +285,13 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t HoPerThread = 1;
|
||||
constexpr index_t WoPerThread = 2;
|
||||
|
||||
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimH = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimN = 8;
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimH = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimN = 8;
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 4;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
|
||||
|
||||
constexpr index_t GemmMPerThreadSubC = 4;
|
||||
constexpr index_t GemmNPerThreadSubC = 4;
|
||||
@@ -298,7 +301,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmNLevel1Cluster = 4;
|
||||
constexpr index_t GemmKPerThreadLoop = 1;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
|
||||
|
||||
constexpr index_t BlockSize = 128;
|
||||
#elif 0
|
||||
@@ -324,14 +327,14 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmDataPerReadA = 1;
|
||||
constexpr index_t GemmDataPerReadB = 1;
|
||||
|
||||
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 4;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 4;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 4;
|
||||
|
||||
constexpr index_t BlockSize = 128;
|
||||
#elif 0
|
||||
@@ -347,13 +350,13 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t HoPerThread = 1;
|
||||
constexpr index_t WoPerThread = 2;
|
||||
|
||||
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimH = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimN = 8;
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimH = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimN = 8;
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 4;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
|
||||
|
||||
constexpr index_t GemmMPerThreadSubC = 4;
|
||||
constexpr index_t GemmNPerThreadSubC = 4;
|
||||
@@ -365,7 +368,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmDataPerReadA = 4;
|
||||
constexpr index_t GemmDataPerReadB = 4;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
|
||||
|
||||
constexpr index_t BlockSize = 128;
|
||||
#elif 0
|
||||
@@ -393,12 +396,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmDataPerReadA = 4;
|
||||
constexpr index_t GemmDataPerReadB = 4;
|
||||
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<4, 2, 4, 4>;
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 4;
|
||||
using InBlockCopyClusterLengths_CHWN = Sequence<4, 2, 4, 4>;
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 4;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
|
||||
#elif 0
|
||||
// for 1x1, 28x28, v1r1, Pascal
|
||||
constexpr index_t NPerBlock = 16;
|
||||
@@ -413,13 +416,13 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t HoPerThread = 1;
|
||||
constexpr index_t WoPerThread = 1;
|
||||
|
||||
constexpr index_t InBlockCopy_ThreadPerDimC = 8;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimW = 2;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimC = 8;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimW = 2;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 4;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
|
||||
|
||||
constexpr index_t GemmMPerThreadSubC = 4;
|
||||
constexpr index_t GemmNPerThreadSubC = 4;
|
||||
@@ -429,7 +432,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmNLevel1Cluster = 4;
|
||||
constexpr index_t GemmKPerThreadLoop = 1;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
|
||||
|
||||
constexpr index_t BlockSize = 128;
|
||||
#elif 0
|
||||
@@ -453,14 +456,14 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmNLevel1Cluster = 2;
|
||||
constexpr index_t GemmKPerThreadLoop = 1;
|
||||
|
||||
constexpr index_t InBlockCopy_ThreadPerDimC = 8;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimW = 2;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
|
||||
constexpr index_t InBlockCopyDataPerRead_N = 4;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimC = 8;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimW = 2;
|
||||
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
|
||||
constexpr index_t InBlockCopyDataPerAccess_N = 4;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 2;
|
||||
|
||||
constexpr index_t BlockSize = 128;
|
||||
#endif
|
||||
@@ -478,9 +481,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
|
||||
#elif 0
|
||||
GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
|
||||
#elif 1
|
||||
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
|
||||
#elif 0
|
||||
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
|
||||
#elif 1
|
||||
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
|
||||
#endif
|
||||
<GridSize,
|
||||
@@ -507,10 +510,13 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
GemmKPerThreadLoop,
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB,
|
||||
InBlockCopySubLengths_CHWN,
|
||||
InBlockCopyClusterLengths_CHWN,
|
||||
InBlockCopyDataPerRead_N,
|
||||
WeiBlockCopyDataPerRead_K,
|
||||
OutThreadCopyDataPerWrite_N>{};
|
||||
InBlockCopyDataPerAccess_N,
|
||||
WeiBlockCopySubLengths_CK,
|
||||
WeiBlockCopyClusterLengths_CK,
|
||||
WeiBlockCopyDataPerAccess_K,
|
||||
OutThreadCopyDataPerAccess_N>{};
|
||||
|
||||
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
|
||||
dim3(GridSize),
|
||||
|
||||
Reference in New Issue
Block a user