diff --git a/driver/conv.cu b/driver/conv.cu index 5c899e00c9..2704db43f6 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -170,7 +170,7 @@ int main() int num_thread = std::thread::hardware_concurrency(); -#if 1 +#if 0 in.GenerateTensorValue(GeneratorTensor{}, num_thread); wei.GenerateTensorValue(GeneratorTensor{}, num_thread); #endif @@ -180,7 +180,7 @@ int main() device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); } -#if 1 +#if 0 host_convolution(in, wei, out_host); float error = 0; diff --git a/driver/device_direct_convolution_3.cuh b/driver/device_direct_convolution_3.cuh index b449c3e857..2be9201f84 100644 --- a/driver/device_direct_convolution_3.cuh +++ b/driver/device_direct_convolution_3.cuh @@ -27,9 +27,9 @@ void device_convolution( constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeW = 2; constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 8; + constexpr unsigned KPerBlock = 32; constexpr unsigned CPerBlock = 2; - constexpr unsigned YPerBlock = 4; + constexpr unsigned YPerBlock = 1; constexpr unsigned XPerBlock = 16; constexpr unsigned NPerThread = 2;