diff --git a/driver/conv.cu b/driver/conv.cu index 195f34bf72..f486d9ad51 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -325,12 +325,20 @@ int main() #if 0 constexpr unsigned N = 1; constexpr unsigned C = 1; - constexpr unsigned HI = 34; - constexpr unsigned WI = 34; + constexpr unsigned HI = 4; + constexpr unsigned WI = 4; constexpr unsigned K = 1; constexpr unsigned S = 3; constexpr unsigned R = 3; -#elif 1 +#elif 0 + constexpr unsigned N = 1; + constexpr unsigned C = 1; + constexpr unsigned HI = 34; + constexpr unsigned WI = 34; + constexpr unsigned K = 1; + constexpr unsigned S = 3; + constexpr unsigned R = 3; +#elif 0 constexpr unsigned N = 64; constexpr unsigned C = 256; constexpr unsigned HI = 34; @@ -338,44 +346,20 @@ int main() constexpr unsigned K = 64; constexpr unsigned S = 3; constexpr unsigned R = 3; -#elif 0 - constexpr unsigned N = 72; - constexpr unsigned C = 288; - constexpr unsigned HI = 38; - constexpr unsigned WI = 38; - constexpr unsigned K = 72; +#elif 1 + constexpr unsigned N = 64; + constexpr unsigned C = 64; + constexpr unsigned HI = 56; + constexpr unsigned WI = 56; + constexpr unsigned K = 64; constexpr unsigned S = 3; constexpr unsigned R = 3; #elif 0 - constexpr unsigned N = 1; - constexpr unsigned C = 1; - constexpr unsigned HI = 18; - constexpr unsigned WI = 18; - constexpr unsigned K = 1; - constexpr unsigned S = 3; - constexpr unsigned R = 3; -#elif 0 - constexpr unsigned N = 1; - constexpr unsigned C = 1; - constexpr unsigned HI = 4; - constexpr unsigned WI = 4; - constexpr unsigned K = 1; - constexpr unsigned S = 3; - constexpr unsigned R = 3; -#elif 0 - constexpr unsigned N = 2; - constexpr unsigned C = 3; - constexpr unsigned HI = 130; - constexpr unsigned WI = 130; - constexpr unsigned K = 5; - constexpr unsigned S = 3; - constexpr unsigned R = 3; -#elif 0 - constexpr unsigned N = 3; - constexpr unsigned C = 16; - constexpr unsigned HI = 130; - constexpr unsigned WI = 130; - constexpr unsigned K = 4; + constexpr unsigned N = 64; + constexpr unsigned C = 64; + constexpr unsigned HI = 66; + constexpr unsigned WI = 66; + constexpr unsigned K = 64; constexpr unsigned S = 3; constexpr unsigned R = 3; #endif @@ -397,7 +381,7 @@ int main() std::size_t num_thread = std::thread::hardware_concurrency(); in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); -#elif 0 +#elif 1 std::size_t num_thread = std::thread::hardware_concurrency(); in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); @@ -412,7 +396,7 @@ int main() #endif } -#if 0 +#if 1 host_winograd_3x3_convolution(in, wei, out_host); check_error(out_host, out_device); #elif 0 diff --git a/driver/device_direct_convolution_1.cuh b/driver/device_direct_convolution_1.cuh index 95b10c3a61..fbd1b98d41 100644 --- a/driver/device_direct_convolution_1.cuh +++ b/driver/device_direct_convolution_1.cuh @@ -21,9 +21,11 @@ void device_direct_convolution_1( constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_desc = InDesc{}; - constexpr auto wei_desc = WeiDesc{}; - constexpr auto out_desc = OutDesc{}; + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + constexpr auto out_desc = OutDesc{}; + +#if 0 constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeW = 2; constexpr unsigned NPerBlock = 2; @@ -37,6 +39,21 @@ void device_direct_convolution_1( constexpr unsigned CPerThread = 2; constexpr unsigned BlockSize = 128; +#elif 1 + constexpr unsigned OutTileSizeH = 2; + constexpr unsigned OutTileSizeW = 2; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 16; + constexpr unsigned CPerBlock = 2; + constexpr unsigned YPerBlock = 2; + constexpr unsigned XPerBlock = 27; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + + constexpr unsigned BlockSize = 216; +#endif constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) * diff --git a/driver/device_direct_convolution_2.cuh b/driver/device_direct_convolution_2.cuh index f37af98086..5c08870a26 100644 --- a/driver/device_direct_convolution_2.cuh +++ b/driver/device_direct_convolution_2.cuh @@ -21,9 +21,11 @@ void device_direct_convolution_2( constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_desc = InDesc{}; - constexpr auto wei_desc = WeiDesc{}; - constexpr auto out_desc = OutDesc{}; + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + constexpr auto out_desc = OutDesc{}; + +#if 0 constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeW = 2; constexpr unsigned NPerBlock = 2; @@ -37,6 +39,35 @@ void device_direct_convolution_2( constexpr unsigned CPerThread = 2; constexpr unsigned BlockSize = 128; +#elif 1 + constexpr unsigned OutTileSizeH = 2; + constexpr unsigned OutTileSizeW = 2; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned YPerBlock = 1; + constexpr unsigned XPerBlock = 27; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + + constexpr unsigned BlockSize = 216; +#elif 1 + constexpr unsigned OutTileSizeH = 2; + constexpr unsigned OutTileSizeW = 2; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned YPerBlock = 1; + constexpr unsigned XPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + + constexpr unsigned BlockSize = 256; +#endif constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *