diff --git a/CMakeLists.txt b/CMakeLists.txt index 150632d790..1c995eb131 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,11 @@ -cmake_minimum_required(VERSION 3.9) -project(convolution LANGUAGES CXX CUDA) +cmake_minimum_required(VERSION 2.8.3) +project(modular_convolution) #c++ +enable_language(CXX) +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") -add_compile_options(-std=c++14) #boost find_package(Boost REQUIRED) @@ -28,20 +30,11 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) link_libraries(${OpenMP_pthread_LIBRARY}) endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) -#python -find_package(PythonLibs 3 REQUIRED) - -message("PYTHON_INCLUDE_DIRS: ${PYTHON_INCLUDE_DIRS}") -message("PYTHON_LIBRARIES: ${PYTHON_LIBRARIES}") - -include_directories(BEFORE ${PYTHON_INCLUDE_DIRS}) -link_libraries(${PYTHON_LIBRARIES}) - #cuda +enable_language(CUDA) include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR}) # include_directories(BEFORE src/include) add_subdirectory(src) add_subdirectory(driver) - diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 25c23dea5a..1641ac0f20 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -1,2 +1,2 @@ -add_executable(conv EXCLUDE_FROM_ALL conv.cu) -target_link_libraries(conv convolution) +add_executable(conv conv.cu) +target_link_libraries(conv tensor device) diff --git a/driver/conv.cu b/driver/conv.cu index b5bf0a2473..fa3577fce6 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -2,13 +2,12 @@ #include #include #include -#include "nvToolsExt.h" #include "tensor.hpp" #include "ConstantTensorDescriptor.cuh" #include "conv_common.cuh" #include "device_direct_convolution_1.cuh" #include "device_direct_convolution_2.cuh" -#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh" +#include "device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh" #include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh" #include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh" #include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh" @@ -590,7 +589,7 @@ int main() #elif 0 device_direct_convolution_2 #elif 0 - device_implicit_gemm_convolution_1_nchw_kcsr + device_implicit_gemm_convolution_1_nchw_kcsr_nkhw #elif 0 device_implicit_gemm_convolution_1_nchw_srck_nkhw #elif 0 @@ -602,7 +601,7 @@ int main() #endif (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(in_nchw_desc, in_nchw, wei_kcsr_desc, @@ -614,7 +613,7 @@ int main() nrepeat); #endif -#if 0 +#if 1 if(S == 3 && R == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads); diff --git a/driver/device_direct_convolution_1.cuh b/driver/device_direct_convolution_1.cuh index 1029026e67..2a1aef83a7 100644 --- a/driver/device_direct_convolution_1.cuh +++ b/driver/device_direct_convolution_1.cuh @@ -1,6 +1,7 @@ #pragma once -#include "gridwise_direct_convolution_1.cuh" #include +#include "device.hpp" +#include "gridwise_direct_convolution_1.cuh" template void device_direct_convolution_1(InDesc, @@ -32,6 +33,7 @@ void device_direct_convolution_1(InDesc, constexpr auto out_desc = OutDesc{}; #if 1 + // 3x3, 34x34 constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeW = 2; constexpr unsigned NPerBlock = 2; @@ -45,20 +47,6 @@ void device_direct_convolution_1(InDesc, constexpr unsigned CPerThread = 2; constexpr unsigned BlockSize = 128; -#elif 1 - constexpr unsigned OutTileSizeH = 2; - constexpr unsigned OutTileSizeW = 2; - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 16; - constexpr unsigned CPerBlock = 2; - constexpr unsigned YPerBlock = 2; - constexpr unsigned XPerBlock = 27; - - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; - - constexpr unsigned BlockSize = 216; #endif constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * @@ -73,45 +61,36 @@ void device_direct_convolution_1(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - cudaEvent_t start, stop; - float elapsedTime; + const void* f = reinterpret_cast(gridwise_direct_convolution_1); - cudaEventCreate(&start); - cudaEventRecord(start, 0); + T* in_dev_ptr = static_cast(in_device_buf.GetDeviceBuffer()); + T* wei_dev_ptr = static_cast(wei_device_buf.GetDeviceBuffer()); + T* out_dev_ptr = static_cast(out_device_buf.GetDeviceBuffer()); - gridwise_direct_convolution_1 - <<>>(InDesc{}, - static_cast(in_device_buf.GetDeviceBuffer()), - WeiDesc{}, - static_cast(wei_device_buf.GetDeviceBuffer()), - OutDesc{}, - static_cast(out_device_buf.GetDeviceBuffer())); + void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + float time = 0; - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); + launch_kernel(f, grid_dim, block_dim, args, time); - usleep(10000); + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); } - checkCudaErrors(cudaGetLastError()); out_device_buf.FromDevice(out.mData.data()); } diff --git a/driver/device_direct_convolution_2.cuh b/driver/device_direct_convolution_2.cuh index 95b8fedf1f..172cfb55a3 100644 --- a/driver/device_direct_convolution_2.cuh +++ b/driver/device_direct_convolution_2.cuh @@ -1,6 +1,7 @@ #pragma once -#include "gridwise_direct_convolution_2.cuh" #include +#include "device.hpp" +#include "gridwise_direct_convolution_2.cuh" template void device_direct_convolution_2(InDesc, @@ -32,6 +33,7 @@ void device_direct_convolution_2(InDesc, constexpr auto out_desc = OutDesc{}; #if 1 + // 3x3, 34x34, 128 thread constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeW = 2; constexpr unsigned NPerBlock = 2; @@ -46,20 +48,7 @@ void device_direct_convolution_2(InDesc, constexpr unsigned BlockSize = 128; #elif 0 - constexpr unsigned OutTileSizeH = 2; - constexpr unsigned OutTileSizeW = 2; - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 4; - constexpr unsigned YPerBlock = 1; - constexpr unsigned XPerBlock = 27; - - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; - - constexpr unsigned BlockSize = 216; -#elif 0 + // 3x3, 34x34, 256 thread constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeW = 2; constexpr unsigned NPerBlock = 2; @@ -87,45 +76,36 @@ void device_direct_convolution_2(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - cudaEvent_t start, stop; - float elapsedTime; + const void* f = reinterpret_cast(gridwise_direct_convolution_2); - cudaEventCreate(&start); - cudaEventRecord(start, 0); + T* in_dev_ptr = static_cast(in_device_buf.GetDeviceBuffer()); + T* wei_dev_ptr = static_cast(wei_device_buf.GetDeviceBuffer()); + T* out_dev_ptr = static_cast(out_device_buf.GetDeviceBuffer()); - gridwise_direct_convolution_2 - <<>>(InDesc{}, - static_cast(in_device_buf.GetDeviceBuffer()), - WeiDesc{}, - static_cast(wei_device_buf.GetDeviceBuffer()), - OutDesc{}, - static_cast(out_device_buf.GetDeviceBuffer())); + void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + float time = 0; - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); + launch_kernel(f, grid_dim, block_dim, args, time); - usleep(10000); + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); } - checkCudaErrors(cudaGetLastError()); out_device_buf.FromDevice(out.mData.data()); } diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh index 7976492690..b588d3d2b7 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh @@ -1,6 +1,7 @@ #pragma once -#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh" #include +#include "device.hpp" +#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh" template void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, @@ -73,21 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, wei_csrk_device_buf.ToDevice(wei_csrk.mData.data()); out_khwn_device_buf.ToDevice(out_khwn.mData.data()); -#if 0 - constexpr unsigned NPerBlock = 1; - constexpr unsigned KPerBlock = 1; - constexpr unsigned CPerBlock = 1; - constexpr unsigned HoPerBlock = 2; - constexpr unsigned WoPerBlock = 4; - - constexpr unsigned NPerThread = 1; - constexpr unsigned KPerThread = 1; - constexpr unsigned CPerThread = 1; - constexpr unsigned HoPerThread = 1; - constexpr unsigned WoPerThread = 1; - - constexpr unsigned BlockSize = 8; -#elif 0 +#if 1 // for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 64; @@ -214,50 +201,42 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - cudaEvent_t start, stop; - float elapsedTime; + const void* f = reinterpret_cast( + gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn); - cudaEventCreate(&start); - cudaEventRecord(start, 0); + T* in_dev_ptr = static_cast(in_chwn_device_buf.GetDeviceBuffer()); + T* wei_dev_ptr = static_cast(wei_csrk_device_buf.GetDeviceBuffer()); + T* out_dev_ptr = static_cast(out_khwn_device_buf.GetDeviceBuffer()); - gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn - <<>>(in_chwn_desc, - static_cast(in_chwn_device_buf.GetDeviceBuffer()), - wei_csrk_desc, - static_cast(wei_csrk_device_buf.GetDeviceBuffer()), - out_khwn_desc, - static_cast(out_khwn_device_buf.GetDeviceBuffer())); + void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + float time = 0; - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); + launch_kernel(f, grid_dim, block_dim, args, time); - usleep(std::min(elapsedTime * 1000, float(10000))); + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); } - checkCudaErrors(cudaGetLastError()); out_khwn_device_buf.FromDevice(out_khwn.mData.data()); // reorder output diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh index f4cc40c71c..4540fc534d 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh @@ -1,8 +1,8 @@ #pragma once +#include +#include "device.hpp" #include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh" #include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh" -#include -#include template void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, @@ -172,7 +172,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, constexpr unsigned WoPerThread = 1; constexpr unsigned BlockSize = 128; -#elif 0 +#elif 1 // 3x3 56x56, NKC = 16,256,128, with padding // 3x3 28x28, NKC = 16,512,256, with padding // 3x3 20x84, NKC = 16,256,256, with padding @@ -222,7 +222,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, constexpr unsigned WoPerThread = 1; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // for 1x1, 28x28 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 128; @@ -253,16 +253,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - cudaEvent_t start, stop; - float elapsedTime; - - cudaEventCreate(&start); - cudaEventRecord(start, 0); - -#if 1 + const void* f = reinterpret_cast( +#if 0 gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded #elif 1 - gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline + gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline #endif - <<>>(static_cast(in_chwn_device_buf.GetDeviceBuffer()), - static_cast(wei_csrk_device_buf.GetDeviceBuffer()), - static_cast(out_khwn_device_buf.GetDeviceBuffer())); + WeiBlockCopyThreadPerDim1>); - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + T* in_dev_ptr = static_cast(in_chwn_device_buf.GetDeviceBuffer()); + T* wei_dev_ptr = static_cast(wei_csrk_device_buf.GetDeviceBuffer()); + T* out_dev_ptr = static_cast(out_khwn_device_buf.GetDeviceBuffer()); - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); + void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - usleep(std::min(elapsedTime * 1000, float(10000))); + float time = 0; + + launch_kernel(f, grid_dim, block_dim, args, time); + + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); } - checkCudaErrors(cudaGetLastError()); out_khwn_device_buf.FromDevice(out_khwn.mData.data()); // reorder output diff --git a/driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh b/driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh deleted file mode 100644 index af4460335b..0000000000 --- a/driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh +++ /dev/null @@ -1,126 +0,0 @@ -#pragma once -#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh" -#include - -template -void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc, - const Tensor& in, - WeiDesc, - const Tensor& wei, - OutDesc, - Tensor& out, - unsigned nrepeat) -{ - std::size_t data_sz = sizeof(T); - DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace()); - DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace()); - - int num_thread = std::thread::hardware_concurrency(); - - in_device_buf.ToDevice(in.mData.data()); - wei_device_buf.ToDevice(wei.mData.data()); - out_device_buf.ToDevice(out.mData.data()); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto in_desc = InDesc{}; - constexpr auto wei_desc = WeiDesc{}; - constexpr auto out_desc = OutDesc{}; - -#if 0 - constexpr unsigned NPerBlock = 1; - constexpr unsigned KPerBlock = 1; - constexpr unsigned CPerBlock = 1; - constexpr unsigned HoPerBlock = 2; - constexpr unsigned WoPerBlock = 32; - - constexpr unsigned KPerThread = 1; - constexpr unsigned CPerThread = 1; - constexpr unsigned HoPerThread = 2; - constexpr unsigned WoPerThread = 2; - - constexpr unsigned BlockSize = 16; -#elif 1 - constexpr unsigned NPerBlock = 1; - constexpr unsigned KPerBlock = 64; - constexpr unsigned CPerBlock = 2; - constexpr unsigned HoPerBlock = 4; - constexpr unsigned WoPerBlock = 32; - - constexpr unsigned KPerThread = 16; - constexpr unsigned CPerThread = 1; - constexpr unsigned HoPerThread = 2; - constexpr unsigned WoPerThread = 2; - - constexpr unsigned BlockSize = 128; -#elif 0 - constexpr unsigned NPerBlock = 1; - constexpr unsigned KPerBlock = 64; - constexpr unsigned CPerBlock = 4; - constexpr unsigned HoPerBlock = 4; - constexpr unsigned WoPerBlock = 32; - - constexpr unsigned KPerThread = 8; - constexpr unsigned CPerThread = 2; - constexpr unsigned HoPerThread = 2; - constexpr unsigned WoPerThread = 4; - - constexpr unsigned BlockSize = 128; -#endif - - constexpr unsigned GridSize = - (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) * - (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock); - - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); - - for(unsigned i = 0; i < nrepeat; ++i) - { - cudaEvent_t start, stop; - float elapsedTime; - - cudaEventCreate(&start); - cudaEventRecord(start, 0); - - gridwise_implicit_gemm_convolution_1_nchw_kcsr - <<>>(InDesc{}, - static_cast(in_device_buf.GetDeviceBuffer()), - WeiDesc{}, - static_cast(wei_device_buf.GetDeviceBuffer()), - OutDesc{}, - static_cast(out_device_buf.GetDeviceBuffer())); - - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); - - usleep(10000); - } - - checkCudaErrors(cudaGetLastError()); - out_device_buf.FromDevice(out.mData.data()); -} diff --git a/driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh b/driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh new file mode 100644 index 0000000000..315d0255c5 --- /dev/null +++ b/driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh @@ -0,0 +1,94 @@ +#pragma once +#include +#include "device.hpp" +#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh" + +template +void device_implicit_gemm_convolution_1_nchw_kcsr_nkhw(InDesc, + const Tensor& in, + WeiDesc, + const Tensor& wei, + OutDesc, + Tensor& out, + unsigned nrepeat) +{ + std::size_t data_sz = sizeof(T); + DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace()); + DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace()); + + int num_thread = std::thread::hardware_concurrency(); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + out_device_buf.ToDevice(out.mData.data()); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + constexpr auto out_desc = OutDesc{}; + +#if 1 + // 3x3, 34x34 + constexpr unsigned NPerBlock = 1; + constexpr unsigned KPerBlock = 64; + constexpr unsigned CPerBlock = 2; + constexpr unsigned HoPerBlock = 4; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 1; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned BlockSize = 128; +#endif + + constexpr unsigned GridSize = + (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) * + (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock); + + dim3 block_dim(BlockSize); + dim3 grid_dim(GridSize); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + for(unsigned i = 0; i < nrepeat; ++i) + { + const void* f = reinterpret_cast( + gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw); + + T* in_dev_ptr = static_cast(in_device_buf.GetDeviceBuffer()); + T* wei_dev_ptr = static_cast(wei_device_buf.GetDeviceBuffer()); + T* out_dev_ptr = static_cast(out_device_buf.GetDeviceBuffer()); + + void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; + + float time = 0; + + launch_kernel(f, grid_dim, block_dim, args, time); + + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); + } + + out_device_buf.FromDevice(out.mData.data()); +} diff --git a/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh b/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh index f6f9ccdbc1..d185ba6baa 100644 --- a/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh +++ b/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh @@ -1,6 +1,7 @@ #pragma once -#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh" #include +#include "device.hpp" +#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh" template void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, @@ -52,20 +53,7 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, wei_srck_device_buf.ToDevice(wei_srck.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 0 - constexpr unsigned NPerBlock = 1; - constexpr unsigned KPerBlock = 1; - constexpr unsigned CPerBlock = 1; - constexpr unsigned HoPerBlock = 2; - constexpr unsigned WoPerBlock = 32; - - constexpr unsigned KPerThread = 1; - constexpr unsigned CPerThread = 1; - constexpr unsigned HoPerThread = 2; - constexpr unsigned WoPerThread = 2; - - constexpr unsigned BlockSize = 16; -#elif 0 +#if 1 // for 3x3, 34x34 constexpr unsigned NPerBlock = 1; constexpr unsigned KPerBlock = 64; @@ -123,45 +111,37 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - cudaEvent_t start, stop; - float elapsedTime; + const void* f = reinterpret_cast( + gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw); - cudaEventCreate(&start); - cudaEventRecord(start, 0); + T* in_dev_ptr = static_cast(in_nchw_device_buf.GetDeviceBuffer()); + T* wei_dev_ptr = static_cast(wei_srck_device_buf.GetDeviceBuffer()); + T* out_dev_ptr = static_cast(out_nkhw_device_buf.GetDeviceBuffer()); - gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw - <<>>(in_nchw_desc, - static_cast(in_nchw_device_buf.GetDeviceBuffer()), - wei_srck_desc, - static_cast(wei_srck_device_buf.GetDeviceBuffer()), - out_nkhw_desc, - static_cast(out_nkhw_device_buf.GetDeviceBuffer())); + void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + float time = 0; - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); + launch_kernel(f, grid_dim, block_dim, args, time); - usleep(10000); + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); } - checkCudaErrors(cudaGetLastError()); out_nkhw_device_buf.FromDevice(out_nkhw.mData.data()); } diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh index 7f3ddc7299..4dc248b496 100644 --- a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh +++ b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh @@ -1,7 +1,8 @@ #pragma once +#include +#include "device.hpp" #include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh" #include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh" -#include template void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, @@ -69,6 +70,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, #if 0 // 3x3, 34x34 + // need to use register double buffer for GEMM constexpr unsigned BPerBlock = 128; constexpr unsigned KPerBlock = 64; constexpr unsigned CPerBlock = 4; @@ -211,60 +213,53 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - cudaEvent_t start, stop; - float elapsedTime; - cudaEventCreate(&start); - cudaEventRecord(start, 0); - + const void* f = reinterpret_cast( #if 0 - gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw + gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw #else - gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer + gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer #endif - - <<>>(in_cnhw_desc, - static_cast(in_cnhw_device_buf.GetDeviceBuffer()), - wei_csrk_desc, - static_cast(wei_csrk_device_buf.GetDeviceBuffer()), - out_knhw_desc, - static_cast(out_knhw_device_buf.GetDeviceBuffer())); + ); - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + T* in_dev_ptr = static_cast(in_cnhw_device_buf.GetDeviceBuffer()); + T* wei_dev_ptr = static_cast(wei_csrk_device_buf.GetDeviceBuffer()); + T* out_dev_ptr = static_cast(out_knhw_device_buf.GetDeviceBuffer()); - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); + void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - usleep(std::min(elapsedTime * 1000, float(10000))); + float time; + + launch_kernel(f, grid_dim, block_dim, args, time); + + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); } - checkCudaErrors(cudaGetLastError()); out_knhw_device_buf.FromDevice(out_knhw.mData.data()); // convert out_knhw to out_nkhw diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh index 11cde76b89..77ceb05f0d 100644 --- a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh +++ b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh @@ -1,7 +1,8 @@ #pragma once +#include +#include "device.hpp" #include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh" #include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh" -#include template void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, @@ -100,7 +101,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, constexpr unsigned InBlockCopyThreadPerDim1 = 16; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // 1x1, 28x28 constexpr unsigned BPerBlock = 64; constexpr unsigned KPerBlock = 64; @@ -140,50 +141,43 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - cudaEvent_t start, stop; - float elapsedTime; - cudaEventCreate(&start); - cudaEventRecord(start, 0); - -#if 0 - gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw + const void* f = reinterpret_cast( +#if 1 + gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw #else - gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline + gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline #endif - - <<>>(in_cnhw_desc, - static_cast(in_cnhw_device_buf.GetDeviceBuffer()), - wei_srck_desc, - static_cast(wei_srck_device_buf.GetDeviceBuffer()), - out_knhw_desc, - static_cast(out_knhw_device_buf.GetDeviceBuffer())); + ); - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + T* in_dev_ptr = static_cast(in_cnhw_device_buf.GetDeviceBuffer()); + T* wei_dev_ptr = static_cast(wei_srck_device_buf.GetDeviceBuffer()); + T* out_dev_ptr = static_cast(out_knhw_device_buf.GetDeviceBuffer()); - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); + void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - usleep(std::min(elapsedTime * 1000, float(10000))); + float time = 0; + + launch_kernel(f, grid_dim, block_dim, args, time); + + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); } - checkCudaErrors(cudaGetLastError()); out_knhw_device_buf.FromDevice(out_knhw.mData.data()); // convert out_knhw to out_nkhw diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b88c759ac9..35ef38b758 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,17 +1,21 @@ -set(SOURCE +set(TENSOR_SOURCE tensor.cpp; ) -add_library(convolution SHARED ${SOURCE}) -set_target_properties(convolution PROPERTIES PREFIX "") +add_library(tensor SHARED ${TENSOR_SOURCE}) +set_target_properties(tensor PROPERTIES PREFIX "") +target_compile_features(tensor PUBLIC) +set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON) +install(TARGETS tensor LIBRARY DESTINATION lib) -# boost.python -target_link_libraries(convolution boost_python3) -# cuda -target_link_libraries(convolution nvToolsExt cudart) -target_compile_features(convolution PUBLIC) -set_target_properties(convolution PROPERTIES POSITION_INDEPENDENT_CODE ON) -set_target_properties(convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) +set(DEVICE_SOURCE + device.cu; +) -install(TARGETS convolution LIBRARY DESTINATION lib) +add_library(device SHARED ${DEVICE_SOURCE}) +set_target_properties(device PROPERTIES PREFIX "") +target_compile_features(device PUBLIC) +set_target_properties(device PROPERTIES POSITION_INDEPENDENT_CODE ON) +install(TARGETS device LIBRARY DESTINATION lib) +target_link_libraries(device nvToolsExt cudart) diff --git a/src/device.cu b/src/device.cu new file mode 100644 index 0000000000..259522aa09 --- /dev/null +++ b/src/device.cu @@ -0,0 +1,79 @@ +#include "device.hpp" +#include "cuda_runtime.h" +#include "nvToolsExt.h" +#include "helper_cuda.h" + +DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) +{ + checkCudaErrors(cudaMalloc(static_cast(&mpDeviceBuf), mMemSize)); +} + +void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; } + +void DeviceMem::ToDevice(const void* p) +{ + checkCudaErrors( + cudaMemcpy(mpDeviceBuf, const_cast(p), mMemSize, cudaMemcpyHostToDevice)); +} + +void DeviceMem::FromDevice(void* p) +{ + checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost)); +} + +DeviceMem::~DeviceMem() { checkCudaErrors(cudaFree(mpDeviceBuf)); } + +struct KernelTimerImpl +{ + KernelTimerImpl() + { + cudaEventCreate(&mStart); + cudaEventCreate(&mEnd); + } + + ~KernelTimerImpl() + { + cudaEventDestroy(mStart); + cudaEventDestroy(mEnd); + } + + void Start() { cudaEventRecord(mStart, 0); } + + void End() + { + cudaEventRecord(mEnd, 0); + cudaEventSynchronize(mEnd); + } + + float GetElapsedTime() const + { + float time; + cudaEventElapsedTime(&time, mStart, mEnd); + return time; + } + + cudaEvent_t mStart, mEnd; +}; + +KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {} + +KernelTimer::~KernelTimer() {} + +void KernelTimer::Start() { impl->Start(); } + +void KernelTimer::End() { impl->End(); } + +float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); } + +void launch_kernel(const void* func, dim3 grid_dim, dim3 block_dim, void** args, float& time) +{ + KernelTimer timer; + timer.Start(); + + cudaError_t error = cudaLaunchKernel(func, grid_dim, block_dim, args, 0, 0); + + timer.End(); + time = timer.GetElapsedTime(); + + checkCudaErrors(error); +} diff --git a/src/include/blockwise_2d_tensor_op.cuh b/src/include/blockwise_2d_tensor_op.cuh index ff4476d0b7..d1463bd807 100644 --- a/src/include/blockwise_2d_tensor_op.cuh +++ b/src/include/blockwise_2d_tensor_op.cuh @@ -513,7 +513,6 @@ struct Blockwise2dTensorCopy3 } } -#if 1 __device__ constexpr unsigned GetRegisterClipboardSize() const { static_assert(is_same::value, "wrong! only support float!\n"); @@ -703,5 +702,4 @@ struct Blockwise2dTensorCopy3 } } } -#endif }; diff --git a/src/include/blockwise_4d_tensor_op.cuh b/src/include/blockwise_4d_tensor_op.cuh index 5cc53977cb..da197338a3 100644 --- a/src/include/blockwise_4d_tensor_op.cuh +++ b/src/include/blockwise_4d_tensor_op.cuh @@ -88,7 +88,7 @@ template __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src( SrcDesc, - Float* const __restrict__ p_src, + const Float* __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths, @@ -187,7 +187,7 @@ template __device__ void blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, - Float* const __restrict__ p_src, + const Float* __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths, @@ -202,7 +202,7 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, template struct Blockwise4dTensorCopy1 { - __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const + __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{}; @@ -219,7 +219,7 @@ template struct BlockwiseChwnTensorCopyPadded { - __device__ void Run(Float* const __restrict__ p_src, + __device__ void Run(const Float* __restrict__ p_src, unsigned c_block_data_begin, unsigned ho_block_data_begin, unsigned wo_block_data_begin, @@ -244,7 +244,7 @@ struct BlockwiseChwnTensorCopyPadded constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; - Float* const p_src_tmp = + const Float* p_src_tmp = p_src + src_desc.Get1dIndex(c_block_data_begin, (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, @@ -336,4 +336,4 @@ struct BlockwiseChwnTensorCopyPadded } } } -}; \ No newline at end of file +}; diff --git a/src/include/device.hpp b/src/include/device.hpp new file mode 100644 index 0000000000..aa13b1857e --- /dev/null +++ b/src/include/device.hpp @@ -0,0 +1,30 @@ +#pragma once +#include + +struct DeviceMem +{ + DeviceMem() = delete; + DeviceMem(std::size_t mem_size); + void* GetDeviceBuffer(); + void ToDevice(const void* p); + void FromDevice(void* p); + ~DeviceMem(); + + void* mpDeviceBuf; + std::size_t mMemSize; +}; + +struct KernelTimerImpl; + +struct KernelTimer +{ + KernelTimer(); + ~KernelTimer(); + void Start(); + void End(); + float GetElapsedTime() const; + + std::unique_ptr impl; +}; + +void launch_kernel(const void* func, dim3 grid_dim, dim3 block_dim, void** args, float& time); diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh index ab7f14dd06..32714c212e 100644 --- a/src/include/gridwise_direct_convolution_1.cuh +++ b/src/include/gridwise_direct_convolution_1.cuh @@ -19,12 +19,9 @@ template -__global__ void gridwise_direct_convolution_1(InGlobalDesc, - Float* const __restrict__ p_in_global, - WeiGlobalDesc, - Float* const __restrict__ p_wei_global, - OutGlobalDesc, - Float* __restrict__ p_out_global) +__global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh index 408fb5e05a..87b77e5e3e 100644 --- a/src/include/gridwise_direct_convolution_2.cuh +++ b/src/include/gridwise_direct_convolution_2.cuh @@ -21,12 +21,9 @@ template -__global__ void gridwise_direct_convolution_2(InGlobalDesc, - Float* const __restrict__ p_in_global, - WeiGlobalDesc, - Float* const __restrict__ p_wei_global, - OutGlobalDesc, - Float* __restrict__ p_out_global) +__global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh index bc58bf77ad..f45a6094ca 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh @@ -28,12 +28,9 @@ template __global__ void -gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc, - Float* const __restrict__ p_in_global, - WeiGlobalDesc, - Float* const __restrict__ p_wei_global, - OutGlobalDesc, - Float* __restrict__ p_out_global) +gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N] // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N" diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh index 4f5c78de87..7bdeb75805 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh @@ -27,10 +27,10 @@ template -__global__ void -gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restrict__ p_in_global, - Float* const __restrict__ p_wei_global, - Float* __restrict__ p_out_global) +__global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded( + const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N] // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N" @@ -143,7 +143,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri decltype(in_chwn_block_desc.GetLengths()), LowerPads>{}; -#if 1 +#if 0 // weight: format is [C,S,R,K] constexpr auto blockwise_wei_copy = Blockwise4dTensorCopy1{}; -#elif 1 +#elif 0 // weight: format is [C*S*R,K] constexpr auto blockwise_wei_copy = Blockwise2dTensorCopy1 __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline( - Float* const __restrict__ p_in_global, - Float* const __restrict__ p_wei_global, - Float* __restrict__ p_out_global) + const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N] // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N" @@ -220,7 +220,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p // set threadwise output tensor to 0 threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread); - Float* p_wei_global_block_begin = + const Float* p_wei_global_block_begin = p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin); // prelog: load data diff --git a/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh b/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh similarity index 96% rename from src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh rename to src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh index 0da67daa23..27b1ff1917 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh @@ -22,12 +22,9 @@ template __global__ void -gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc, - Float* const __restrict__ p_in_global, - WeiGlobalDesc, - Float* const __restrict__ p_wei_global, - OutGlobalDesc, - Float* __restrict__ p_out_global) +gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N] // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N" diff --git a/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh b/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh index 9dae99e181..e1021d0f10 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh @@ -23,12 +23,9 @@ template __global__ void -gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, - Float* const __restrict__ p_in_global, - WeiGlobalDesc, - Float* const __restrict__ p_wei_global, - OutGlobalDesc, - Float* __restrict__ p_out_global) +gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N] // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N" diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh index b2533448f6..86b0c04171 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh @@ -35,11 +35,8 @@ template __global__ void -gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, - const Float* const __restrict__ p_in_global, - WeiGlobalDesc, +gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, - OutGlobalDesc, Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh index 16165f6195..5b6f77cd0b 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh @@ -35,12 +35,9 @@ template __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer( - InGlobalDesc, - Float* const __restrict__ p_in_global, - WeiGlobalDesc, - Float* const __restrict__ p_wei_global, - OutGlobalDesc, - Float* __restrict__ p_out_global) + const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh index 4e64690722..29f9b3b81a 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh @@ -25,12 +25,9 @@ template __global__ void -gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc, - Float* const __restrict__ p_in_global, - WeiGlobalDesc, - Float* const __restrict__ p_wei_global, - OutGlobalDesc, - Float* __restrict__ p_out_global) +gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -174,10 +171,10 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc, // set threadwise output tensor to 0 threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); - Float* p_in_global_block_offset = + const Float* p_in_global_block_offset = p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); - Float* p_wei_global_block_offset = + const Float* p_wei_global_block_offset = p_wei_global + wei_srck_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh index 0375debfff..82fac0c76a 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh @@ -25,12 +25,9 @@ template __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline( - InGlobalDesc, - Float* const __restrict__ p_in_global, - WeiGlobalDesc, - Float* const __restrict__ p_wei_global, - OutGlobalDesc, - Float* __restrict__ p_out_global) + const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/gridwise_winograd_convolution.cuh b/src/include/gridwise_winograd_convolution.cuh index 3d5b739263..8656808831 100644 --- a/src/include/gridwise_winograd_convolution.cuh +++ b/src/include/gridwise_winograd_convolution.cuh @@ -19,12 +19,9 @@ template -__global__ void gridwise_winograd_convolution(InGlobalDesc, - Float* const __restrict__ p_in_global, - WeiGlobalDesc, - Float* const __restrict__ p_wei_global, - OutGlobalDesc, - Float* __restrict__ p_out_global) +__global__ void gridwise_winograd_convolution(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -228,4 +225,4 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc, k_block_data_begin + k_thread_data_begin, ho_block_data_begin + y_thread_data_begin * OutTileSizeH, wo_block_data_begin + x_thread_data_begin * OutTileSizeW)); -} \ No newline at end of file +} diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp index 39e8949a21..09ac224007 100644 --- a/src/include/tensor.hpp +++ b/src/include/tensor.hpp @@ -6,8 +6,6 @@ #include #include #include -#include "cuda_runtime.h" -#include "helper_cuda.h" template std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim) @@ -108,33 +106,6 @@ struct TensorDescriptor std::vector mStrides; }; -struct DeviceMem -{ - DeviceMem() = delete; - DeviceMem(std::size_t mem_size) : mMemSize(mem_size) - { - cudaMalloc(static_cast(&mpDeviceBuf), mMemSize); - } - - void* GetDeviceBuffer() { return mpDeviceBuf; } - - int ToDevice(const void* p) - { - return static_cast( - cudaMemcpy(mpDeviceBuf, const_cast(p), mMemSize, cudaMemcpyHostToDevice)); - } - - int FromDevice(void* p) - { - return static_cast(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost)); - } - - ~DeviceMem() { cudaFree(mpDeviceBuf); } - - void* mpDeviceBuf; - std::size_t mMemSize; -}; - struct joinable_thread : std::thread { template