From 67c6f73ffe0dc06659757c8e28901187394de77b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 15 Feb 2019 00:54:30 -0600 Subject: [PATCH] hip build --- CMakeLists.txt | 20 ++- build/{cmake.sh => cmake-cuda.sh} | 0 build/cmake-hip.sh | 16 +++ driver/CMakeLists.txt | 6 +- driver/device_direct_convolution_1.cuh | 50 ++++---- driver/device_direct_convolution_2.cuh | 50 ++++---- ...icit_gemm_convolution_1_chwn_csrk_khwn.cuh | 22 ++-- ...mm_convolution_1_chwn_csrk_khwn_padded.cuh | 23 ++-- ...icit_gemm_convolution_1_nchw_kcsr_nkhw.cuh | 49 +++----- ...icit_gemm_convolution_1_nchw_srck_nkhw.cuh | 22 ++-- ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh | 22 ++-- ...icit_gemm_convolution_2_cnhw_srck_knhw.cuh | 21 ++-- driver/{conv.cu => driver.cpp} | 17 +-- src/CMakeLists.txt | 22 ++-- src/device.cpp | 114 ++++++++++++++++++ src/device.cu | 79 ------------ src/include/blockwise_4d_tensor_op.cuh | 9 +- src/include/blockwise_direct_convolution.cuh | 27 +++-- src/include/blockwise_gemm.cuh | 14 ++- src/include/common.cuh | 2 +- src/include/config.h.in | 11 ++ src/include/device.hpp | 30 ++++- src/include/gridwise_direct_convolution_1.cuh | 20 +-- src/include/gridwise_direct_convolution_2.cuh | 39 +++--- ...icit_gemm_convolution_1_chwn_csrk_khwn.cuh | 18 +-- ...mm_convolution_1_chwn_csrk_khwn_padded.cuh | 9 +- ...n_1_chwn_csrk_khwn_padded_lds_pipeline.cuh | 11 +- ...icit_gemm_convolution_1_nchw_kcsr_nkhw.cuh | 27 +++-- ...icit_gemm_convolution_1_nchw_srck_nkhw.cuh | 25 ++-- ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh | 12 +- ...ion_2_cnhw_csrk_knhw_lds_double_buffer.cuh | 14 +-- ...icit_gemm_convolution_2_cnhw_srck_knhw.cuh | 4 +- ...volution_2_cnhw_srck_knhw_lds_pipeline.cuh | 17 +-- src/include/gridwise_winograd_convolution.cuh | 23 ++-- src/include/tensor.hpp | 3 +- 35 files changed, 454 insertions(+), 394 deletions(-) rename build/{cmake.sh => cmake-cuda.sh} (100%) create mode 100755 build/cmake-hip.sh rename driver/{conv.cu => driver.cpp} (98%) create mode 100644 src/device.cpp delete mode 100644 src/device.cu create mode 100644 src/include/config.h.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c995eb131..252807ff4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ project(modular_convolution) enable_language(CXX) set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") #boost @@ -16,7 +17,7 @@ message("Boost_LIBRARY_DIRS: ${Boost_LIBRARY_DIRS}") include_directories(BEFORE ${Boost_INCLUDE_DIRS}) link_directories(${Boost_LIBRARY_DIRS}) -#openMP +#OpenMP if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) find_package(OpenMP REQUIRED) @@ -30,11 +31,20 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) link_libraries(${OpenMP_pthread_LIBRARY}) endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) -#cuda -enable_language(CUDA) -include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR}) +#GPU backend +if(DEVICE_BACKEND STREQUAL "HIP") + set(DEVICE_BACKEND_HIP 1) + + set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH}) + find_package(HIP REQUIRED) +elseif(DEVICE_BACKEND STREQUAL "CUDA") + set(DEVICE_BACKEND_CUDA 1) + + enable_language(CUDA) + include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR}) +endif() # -include_directories(BEFORE src/include) +include_directories(BEFORE src/include ${PROJECT_BINARY_DIR}/src/include) add_subdirectory(src) add_subdirectory(driver) diff --git a/build/cmake.sh b/build/cmake-cuda.sh similarity index 100% rename from build/cmake.sh rename to build/cmake-cuda.sh diff --git a/build/cmake-hip.sh b/build/cmake-hip.sh new file mode 100755 index 0000000000..e418ea2fc1 --- /dev/null +++ b/build/cmake-hip.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +rm -f CMakeCache.txt +rm -f *.cmake +rm -rf CMakeFiles + +MY_PROJECT_SOURCE=/home/chao/code/modular_convolution +MY_PROJECT_INSTALL=../install.dir + +cmake \ +-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ +-D CMAKE_BUILD_TYPE=Release \ +-D DEVICE_BACKEND="HIP" \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ +${MY_PROJECT_SOURCE} diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 1641ac0f20..1497be2006 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -1,2 +1,4 @@ -add_executable(conv conv.cu) -target_link_libraries(conv tensor device) +set(DRIVER_SOURCE driver.cpp) + +add_executable(driver ${DRIVER_SOURCE}) +target_link_libraries(driver PRIVATE tensor) diff --git a/driver/device_direct_convolution_1.cuh b/driver/device_direct_convolution_1.cuh index 2a1aef83a7..9a3e9c32d2 100644 --- a/driver/device_direct_convolution_1.cuh +++ b/driver/device_direct_convolution_1.cuh @@ -54,39 +54,31 @@ void device_direct_convolution_1(InDesc, (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) * (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock)); - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); for(unsigned i = 0; i < nrepeat; ++i) { - const void* f = reinterpret_cast(gridwise_direct_convolution_1); - - T* in_dev_ptr = static_cast(in_device_buf.GetDeviceBuffer()); - T* wei_dev_ptr = static_cast(wei_device_buf.GetDeviceBuffer()); - T* out_dev_ptr = static_cast(out_device_buf.GetDeviceBuffer()); - - void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - - float time = 0; - - launch_kernel(f, grid_dim, block_dim, args, time); + float time = launch_kernel(gridwise_direct_convolution_1, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_direct_convolution_2.cuh b/driver/device_direct_convolution_2.cuh index 172cfb55a3..cfc88e45c7 100644 --- a/driver/device_direct_convolution_2.cuh +++ b/driver/device_direct_convolution_2.cuh @@ -69,39 +69,31 @@ void device_direct_convolution_2(InDesc, (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) * (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock)); - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); for(unsigned i = 0; i < nrepeat; ++i) { - const void* f = reinterpret_cast(gridwise_direct_convolution_2); - - T* in_dev_ptr = static_cast(in_device_buf.GetDeviceBuffer()); - T* wei_dev_ptr = static_cast(wei_device_buf.GetDeviceBuffer()); - T* out_dev_ptr = static_cast(out_device_buf.GetDeviceBuffer()); - - void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - - float time = 0; - - launch_kernel(f, grid_dim, block_dim, args, time); + float time = launch_kernel(gridwise_direct_convolution_2, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh index b588d3d2b7..fb36282dbc 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh @@ -194,14 +194,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) * ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock); - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); for(unsigned i = 0; i < nrepeat; ++i) { - const void* f = reinterpret_cast( + float time = launch_kernel( gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn); - - T* in_dev_ptr = static_cast(in_chwn_device_buf.GetDeviceBuffer()); - T* wei_dev_ptr = static_cast(wei_csrk_device_buf.GetDeviceBuffer()); - T* out_dev_ptr = static_cast(out_khwn_device_buf.GetDeviceBuffer()); - - void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - - float time = 0; - - launch_kernel(f, grid_dim, block_dim, args, time); + WeiBlockCopyDataPerRead>, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_chwn_device_buf.GetDeviceBuffer()), + static_cast(wei_csrk_device_buf.GetDeviceBuffer()), + static_cast(out_khwn_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh index 4540fc534d..f755c36e94 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh @@ -94,7 +94,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, constexpr unsigned WeiBlockCopyThreadPerDim1 = 1; constexpr unsigned BlockSize = 8; -#elif 0 +#elif 1 // for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 64; @@ -246,14 +246,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) * ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock); - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); for(unsigned i = 0; i < nrepeat; ++i) { - const void* f = reinterpret_cast( + float time = launch_kernel( #if 0 gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded #elif 1 @@ -278,17 +275,13 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, HoPerThread, WoPerThread, WeiBlockCopyThreadPerDim0, - WeiBlockCopyThreadPerDim1>); + WeiBlockCopyThreadPerDim1>, + dim3(GridSize), + dim3(BlockSize), - T* in_dev_ptr = static_cast(in_chwn_device_buf.GetDeviceBuffer()); - T* wei_dev_ptr = static_cast(wei_csrk_device_buf.GetDeviceBuffer()); - T* out_dev_ptr = static_cast(out_khwn_device_buf.GetDeviceBuffer()); - - void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - - float time = 0; - - launch_kernel(f, grid_dim, block_dim, args, time); + static_cast(in_chwn_device_buf.GetDeviceBuffer()), + static_cast(wei_csrk_device_buf.GetDeviceBuffer()), + static_cast(out_khwn_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh b/driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh index 315d0255c5..e78f3b0660 100644 --- a/driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh +++ b/driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh @@ -52,39 +52,30 @@ void device_implicit_gemm_convolution_1_nchw_kcsr_nkhw(InDesc, (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) * (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock); - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); for(unsigned i = 0; i < nrepeat; ++i) { - const void* f = reinterpret_cast( - gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw); - - T* in_dev_ptr = static_cast(in_device_buf.GetDeviceBuffer()); - T* wei_dev_ptr = static_cast(wei_device_buf.GetDeviceBuffer()); - T* out_dev_ptr = static_cast(out_device_buf.GetDeviceBuffer()); - - void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - - float time = 0; - - launch_kernel(f, grid_dim, block_dim, args, time); + float time = launch_kernel(gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh b/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh index d185ba6baa..7f4f139d39 100644 --- a/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh +++ b/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh @@ -104,14 +104,11 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) * ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock); - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); for(unsigned i = 0; i < nrepeat; ++i) { - const void* f = reinterpret_cast( + float time = launch_kernel( gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw); - - T* in_dev_ptr = static_cast(in_nchw_device_buf.GetDeviceBuffer()); - T* wei_dev_ptr = static_cast(wei_srck_device_buf.GetDeviceBuffer()); - T* out_dev_ptr = static_cast(out_nkhw_device_buf.GetDeviceBuffer()); - - void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - - float time = 0; - - launch_kernel(f, grid_dim, block_dim, args, time); + WoPerThread>, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_nchw_device_buf.GetDeviceBuffer()), + static_cast(wei_srck_device_buf.GetDeviceBuffer()), + static_cast(out_nkhw_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh index 4dc248b496..34f4745501 100644 --- a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh +++ b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh @@ -195,9 +195,6 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, constexpr unsigned GridSize = ((N * Hi * Wi + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock); - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); // mem @@ -213,7 +210,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - const void* f = reinterpret_cast( + float time = launch_kernel( #if 0 gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw #else @@ -244,17 +241,12 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim1, InBlockCopyDataPerRead, - WeiBlockCopyDataPerRead>); - - T* in_dev_ptr = static_cast(in_cnhw_device_buf.GetDeviceBuffer()); - T* wei_dev_ptr = static_cast(wei_csrk_device_buf.GetDeviceBuffer()); - T* out_dev_ptr = static_cast(out_knhw_device_buf.GetDeviceBuffer()); - - void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - - float time; - - launch_kernel(f, grid_dim, block_dim, args, time); + WeiBlockCopyDataPerRead>, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_cnhw_device_buf.GetDeviceBuffer()), + static_cast(wei_csrk_device_buf.GetDeviceBuffer()), + static_cast(out_knhw_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh index 77ceb05f0d..d0bc18dbba 100644 --- a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh +++ b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh @@ -123,9 +123,6 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, constexpr unsigned GridSize = ((N * Hi * Wi + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock); - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); // mem @@ -141,7 +138,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - const void* f = reinterpret_cast( + float time = launch_kernel( #if 1 gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw #else @@ -162,17 +159,13 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, GemmThreadPerColumnPerCluster, GemmThreadPerRowPerCluster, InBlockCopyThreadPerDim0, - InBlockCopyThreadPerDim1>); + InBlockCopyThreadPerDim1>, + dim3(GridSize), + dim3(BlockSize), - T* in_dev_ptr = static_cast(in_cnhw_device_buf.GetDeviceBuffer()); - T* wei_dev_ptr = static_cast(wei_srck_device_buf.GetDeviceBuffer()); - T* out_dev_ptr = static_cast(out_knhw_device_buf.GetDeviceBuffer()); - - void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr}; - - float time = 0; - - launch_kernel(f, grid_dim, block_dim, args, time); + static_cast(in_cnhw_device_buf.GetDeviceBuffer()), + static_cast(wei_srck_device_buf.GetDeviceBuffer()), + static_cast(out_knhw_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/conv.cu b/driver/driver.cpp similarity index 98% rename from driver/conv.cu rename to driver/driver.cpp index fa3577fce6..80c1329d2c 100644 --- a/driver/conv.cu +++ b/driver/driver.cpp @@ -2,6 +2,7 @@ #include #include #include +#include "config.h" #include "tensor.hpp" #include "ConstantTensorDescriptor.cuh" #include "conv_common.cuh" @@ -49,7 +50,7 @@ struct GeneratorTensor_3 std::initializer_list ids = {static_cast(is)...}; std::vector lens(sizeof...(Is), 100); std::vector strides(sizeof...(Is), 1); - std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1); + std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is)-1), strides.rbegin() + 1); return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1; #endif } @@ -339,7 +340,7 @@ void host_winograd_3x3_convolution( std::size_t ho = OutTileSizeH * y + j; for(int i = 0; i < OutTileSizeW; ++i) { - std::size_t wo = OutTileSizeW * x + i; + std::size_t wo = OutTileSizeW * x + i; out(n, k, ho, wo) = out_hold(n, k, y, x, j, i); } } @@ -392,13 +393,13 @@ int main() constexpr unsigned WPad = 0; #elif 0 // 3x3, 34x34 - constexpr unsigned N = 64; - constexpr unsigned C = 256; + constexpr unsigned N = 64; + constexpr unsigned C = 256; constexpr unsigned HI = 34; constexpr unsigned WI = 34; - constexpr unsigned K = 64; - constexpr unsigned S = 3; - constexpr unsigned R = 3; + constexpr unsigned K = 64; + constexpr unsigned S = 3; + constexpr unsigned R = 3; constexpr unsigned HPad = 0; constexpr unsigned WPad = 0; @@ -601,7 +602,7 @@ int main() #endif (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 +#elif 1 device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(in_nchw_desc, in_nchw, wei_kcsr_desc, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 35ef38b758..a8f81127f5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,21 +1,17 @@ +configure_file("${PROJECT_SOURCE_DIR}/src/include/config.h.in" "${PROJECT_BINARY_DIR}/src/include/config.h") + set(TENSOR_SOURCE tensor.cpp; + device.cpp; ) add_library(tensor SHARED ${TENSOR_SOURCE}) -set_target_properties(tensor PROPERTIES PREFIX "") target_compile_features(tensor PUBLIC) set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON) + + +if(DEVICE_BACKEND STREQUAL "CUDA") + target_link_libraries(device nvToolsExt cudart) +endif() + install(TARGETS tensor LIBRARY DESTINATION lib) - - -set(DEVICE_SOURCE - device.cu; -) - -add_library(device SHARED ${DEVICE_SOURCE}) -set_target_properties(device PROPERTIES PREFIX "") -target_compile_features(device PUBLIC) -set_target_properties(device PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device LIBRARY DESTINATION lib) -target_link_libraries(device nvToolsExt cudart) diff --git a/src/device.cpp b/src/device.cpp new file mode 100644 index 0000000000..03bd208d4d --- /dev/null +++ b/src/device.cpp @@ -0,0 +1,114 @@ +#include "config.h" +#include "device.hpp" + +DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) +{ +#if DEVICE_BACKEND_HIP + hipGetErrorString(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); +#elif DEVICE_BACKEND_CUDA + checkCudaErrors(cudaMalloc(static_cast(&mpDeviceBuf), mMemSize)); +#endif +} + +void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; } + +void DeviceMem::ToDevice(const void* p) +{ +#if DEVICE_BACKEND_HIP + hipGetErrorString( + hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice)); +#elif DEVICE_BACKEND_CUDA + checkCudaErrors( + cudaMemcpy(mpDeviceBuf, const_cast(p), mMemSize, cudaMemcpyHostToDevice)); +#endif +} + +void DeviceMem::FromDevice(void* p) +{ +#if DEVICE_BACKEND_HIP + hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); +#elif DEVICE_BACKEND_CUDA + checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost)); +#endif +} + +DeviceMem::~DeviceMem() +{ +#if DEVICE_BACKEND_HIP + hipGetErrorString(hipFree(mpDeviceBuf)); +#elif DEVICE_BACKEND_CUDA + checkCudaErrors(cudaFree(mpDeviceBuf)); +#endif +} + +struct KernelTimerImpl +{ + KernelTimerImpl() + { +#if DEVICE_BACKEND_HIP + hipEventCreate(&mStart); + hipEventCreate(&mEnd); +#elif DEVICE_BACKEND_CUDA + cudaEventCreate(&mStart); + cudaEventCreate(&mEnd); +#endif + } + + ~KernelTimerImpl() + { +#if DEVICE_BACKEND_HIP + hipEventDestroy(mStart); + hipEventDestroy(mEnd); +#elif DEVICE_BACKEND_CUDA + cudaEventDestroy(mStart); + cudaEventDestroy(mEnd); +#endif + } + + void Start() + { +#if DEVICE_BACKEND_HIP + hipEventRecord(mStart, 0); +#elif DEVICE_BACKEND_CUDA + cudaEventRecord(mStart, 0); +#endif + } + + void End() + { +#if DEVICE_BACKEND_HIP + hipEventRecord(mEnd, 0); + hipEventSynchronize(mEnd); +#elif DEVICE_BACKEND_CUDA + cudaEventRecord(mEnd, 0); + cudaEventSynchronize(mEnd); +#endif + } + + float GetElapsedTime() const + { + float time; +#if DEVICE_BACKEND_HIP + hipEventElapsedTime(&time, mStart, mEnd); +#elif DEVICE_BACKEND_CUDA + cudaEventElapsedTime(&time, mStart, mEnd); +#endif + return time; + } + +#if DEVICE_BACKEND_HIP + hipEvent_t mStart, mEnd; +#elif DEVICE_BACKEND_CUDA + cudaEvent_t mStart, mEnd; +#endif +}; + +KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {} + +KernelTimer::~KernelTimer() {} + +void KernelTimer::Start() { impl->Start(); } + +void KernelTimer::End() { impl->End(); } + +float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); } diff --git a/src/device.cu b/src/device.cu deleted file mode 100644 index 259522aa09..0000000000 --- a/src/device.cu +++ /dev/null @@ -1,79 +0,0 @@ -#include "device.hpp" -#include "cuda_runtime.h" -#include "nvToolsExt.h" -#include "helper_cuda.h" - -DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) -{ - checkCudaErrors(cudaMalloc(static_cast(&mpDeviceBuf), mMemSize)); -} - -void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; } - -void DeviceMem::ToDevice(const void* p) -{ - checkCudaErrors( - cudaMemcpy(mpDeviceBuf, const_cast(p), mMemSize, cudaMemcpyHostToDevice)); -} - -void DeviceMem::FromDevice(void* p) -{ - checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost)); -} - -DeviceMem::~DeviceMem() { checkCudaErrors(cudaFree(mpDeviceBuf)); } - -struct KernelTimerImpl -{ - KernelTimerImpl() - { - cudaEventCreate(&mStart); - cudaEventCreate(&mEnd); - } - - ~KernelTimerImpl() - { - cudaEventDestroy(mStart); - cudaEventDestroy(mEnd); - } - - void Start() { cudaEventRecord(mStart, 0); } - - void End() - { - cudaEventRecord(mEnd, 0); - cudaEventSynchronize(mEnd); - } - - float GetElapsedTime() const - { - float time; - cudaEventElapsedTime(&time, mStart, mEnd); - return time; - } - - cudaEvent_t mStart, mEnd; -}; - -KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {} - -KernelTimer::~KernelTimer() {} - -void KernelTimer::Start() { impl->Start(); } - -void KernelTimer::End() { impl->End(); } - -float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); } - -void launch_kernel(const void* func, dim3 grid_dim, dim3 block_dim, void** args, float& time) -{ - KernelTimer timer; - timer.Start(); - - cudaError_t error = cudaLaunchKernel(func, grid_dim, block_dim, args, 0, 0); - - timer.End(); - time = timer.GetElapsedTime(); - - checkCudaErrors(error); -} diff --git a/src/include/blockwise_4d_tensor_op.cuh b/src/include/blockwise_4d_tensor_op.cuh index da197338a3..693a0e4abe 100644 --- a/src/include/blockwise_4d_tensor_op.cuh +++ b/src/include/blockwise_4d_tensor_op.cuh @@ -245,10 +245,11 @@ struct BlockwiseChwnTensorCopyPadded constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; const Float* p_src_tmp = - p_src + src_desc.Get1dIndex(c_block_data_begin, - (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, - (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, - n_block_data_begin); + p_src + + src_desc.Get1dIndex(c_block_data_begin, + (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, + (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, + n_block_data_begin); #if 0 if(get_thread_local_1d_id() == 0) diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh index b60c458af3..48856ffba7 100644 --- a/src/include/blockwise_direct_convolution.cuh +++ b/src/include/blockwise_direct_convolution.cuh @@ -95,10 +95,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, Float p_out_thread[out_thread_desc.GetElementSpace()]; threadwise_4d_tensor_copy(out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc, p_out_thread, out_thread_desc.GetLengths()); @@ -109,10 +110,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, // threadwise convolution threadwise_direct_convolution_2( in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data_begin, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data_begin, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0), @@ -124,10 +126,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, threadwise_4d_tensor_copy(out_thread_desc, p_out_thread, out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc.GetLengths()); } } diff --git a/src/include/blockwise_gemm.cuh b/src/include/blockwise_gemm.cuh index 802bade82c..32c1bf71aa 100644 --- a/src/include/blockwise_gemm.cuh +++ b/src/include/blockwise_gemm.cuh @@ -305,8 +305,9 @@ struct BlockwiseGemmBlockABlockBThreadC constexpr unsigned NClusterWork = (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster); - static_assert(BlockSize == (MClusterWork * MThreadPerCluster) * - (NClusterWork * NThreadPerCluster), + static_assert(BlockSize == + (MClusterWork * MThreadPerCluster) * + (NClusterWork * NThreadPerCluster), "wrong! wrong BlockSize"); if(DistributeThreadAlongColumnFirst) @@ -685,7 +686,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 constexpr unsigned MRepeat = MPerThread / MPerThreadSubC; constexpr unsigned NRepeat = NPerThread / NPerThreadSubC; - // preload A, B +// preload A, B #pragma unroll for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { // copy A-sub to form A @@ -718,7 +719,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 FloatA* p_a_thread_next = even_loop ? p_a_thread_1 : p_a_thread_0; FloatB* p_b_thread_next = even_loop ? p_b_thread_1 : p_b_thread_0; - // preload next A, B +// preload next A, B #pragma unroll for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { // copy A-sub to form A @@ -906,8 +907,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), c_thread_sub_mtx, False, - p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, - n_repeat * NPerThreadSubC), + p_c_thread + + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, + n_repeat * NPerThreadSubC), f_accum); } } diff --git a/src/include/common.cuh b/src/include/common.cuh index 461ea6fe11..f63a13b259 100644 --- a/src/include/common.cuh +++ b/src/include/common.cuh @@ -62,4 +62,4 @@ struct Sequence printf("Sequence::ReorderByPutOldToNew not implemented"); assert(false); } -}; \ No newline at end of file +}; diff --git a/src/include/config.h.in b/src/include/config.h.in new file mode 100644 index 0000000000..7b888c6951 --- /dev/null +++ b/src/include/config.h.in @@ -0,0 +1,11 @@ +#pragma once +#cmakedefine01 DEVICE_BACKEND_HIP +#cmakedefine01 DEVICE_BACKEND_CUDA + +#if DEVICE_BACKEND_HIP +#include "hip/hip_runtime.h" +#elif DEVICE_BACKEND_CUDA +#include "cuda_runtime.h" +#include "nvToolsExt.h" +#include "helper_cuda.h" +#endif diff --git a/src/include/device.hpp b/src/include/device.hpp index aa13b1857e..3a131a2aa3 100644 --- a/src/include/device.hpp +++ b/src/include/device.hpp @@ -1,5 +1,6 @@ #pragma once #include +#include "config.h" struct DeviceMem { @@ -27,4 +28,31 @@ struct KernelTimer std::unique_ptr impl; }; -void launch_kernel(const void* func, dim3 grid_dim, dim3 block_dim, void** args, float& time); +template +float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args) +{ + KernelTimer timer; + +#if DEVICE_BACKEND_HIP + timer.Start(); + + hipLaunchKernelGGL(kernel, grid_dim, block_dim, 0, 0, args...); + + timer.End(); + + hipGetErrorString(hipGetLastError()); +#elif DEVICE_BACKEND_CUDA + const void* f = reinterpret_cast(kernel); + void* p_args = {&args...}; + + timer.Start(); + + cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, 0, 0); + + timer.End(); + + checkCudaErrors(error); +#endif + + return timer.GetElapsedTime(); +} diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh index 32714c212e..443924fa26 100644 --- a/src/include/gridwise_direct_convolution_1.cuh +++ b/src/include/gridwise_direct_convolution_1.cuh @@ -1,4 +1,5 @@ #pragma once +#include "common.cuh" #include "ConstantTensorDescriptor.cuh" #include "blockwise_4d_tensor_op.cuh" #include "blockwise_direct_convolution.cuh" @@ -146,10 +147,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ c_block_work_begin += CPerBlock) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, - c_block_work_begin, - hi_block_work_begin, - wi_block_work_begin), + blockwise_in_copy.Run(p_in_global + + in_global_desc.Get1dIndex(n_block_work_begin, + c_block_work_begin, + hi_block_work_begin, + wi_block_work_begin), p_in_block); // copy weight tensor to LDS @@ -176,9 +178,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ } // copy output tensor from LDS to device mem - blockwise_out_copy.Run(p_out_block, - p_out_global + out_global_desc.Get1dIndex(n_block_work_begin, - k_block_work_begin, - ho_block_work_begin, - wo_block_work_begin)); + blockwise_out_copy.Run( + p_out_block, + p_out_global + + out_global_desc.Get1dIndex( + n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin)); } diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh index 87b77e5e3e..6fab112c3e 100644 --- a/src/include/gridwise_direct_convolution_2.cuh +++ b/src/include/gridwise_direct_convolution_2.cuh @@ -1,4 +1,5 @@ #pragma once +#include "common.cuh" #include "ConstantTensorDescriptor.cuh" #include "blockwise_4d_tensor_op.cuh" #include "blockwise_direct_convolution.cuh" @@ -162,10 +163,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + + in_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS @@ -177,14 +179,15 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) { - // threadwise convolution +// threadwise convolution #if 1 threadwise_direct_convolution_2( in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), out_thread_desc, @@ -192,10 +195,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ #elif 0 threadwise_direct_convolution_3( in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), out_thread_desc, @@ -209,9 +213,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ out_thread_desc, p_out_thread, out_global_desc, - p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh index f45a6094ca..db408c6b70 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh @@ -121,8 +121,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric decltype(in_chwn_block_desc), decltype(in_chwn_block_desc.GetLengths())>{}; - // blockwise wei copy - // format is [CPerBlock*S*R,KPerBlock] +// blockwise wei copy +// format is [CPerBlock*S*R,KPerBlock] #if 0 const auto blockwise_wei_copy = Blockwise2dTensorCopy1( in_nchw_global_desc, - p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + p_in_global + + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), in_chwn_block_desc, p_in_block, in_nchw_block_desc.GetLengths(), @@ -244,10 +245,11 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric out_hkwn_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_hkwn_thread_desc.GetLengths(), reorder_nkhw_from_hkwn); #else @@ -261,10 +263,11 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); #endif } diff --git a/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh b/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh index e1021d0f10..691675cd74 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh @@ -166,10 +166,11 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric // convert [N,C,Hi,Wi] to [C,Hi,Wi,N] blockwise_4d_tensor_copy_reorder_by_get_dst_from_src( in_nchw_global_desc, - p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + p_in_global + + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), in_chwn_block_desc, p_in_block, in_nchw_block_desc.GetLengths(), @@ -179,9 +180,10 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric #if 1 // weight: global mem to LDS, // format is [S,R,C,K], no conversion needed - blockwise_wei_copy.Run(p_wei_global + wei_srck_global_desc.Get1dIndex( - 0, 0, c_block_data_begin, k_block_data_begin), - p_wei_block); + blockwise_wei_copy.Run( + p_wei_global + + wei_srck_global_desc.Get1dIndex(0, 0, c_block_data_begin, k_block_data_begin), + p_wei_block); #endif __syncthreads(); @@ -217,10 +219,11 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric out_hkwn_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_hkwn_thread_desc.GetLengths(), reorder_nkhw_from_hkwn); } diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh index 86b0c04171..5aec794a83 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh @@ -111,8 +111,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric } #endif - // blockwise in copy - // formmat is [CPerBlock,BPerBlock + BGhostRead] +// blockwise in copy +// formmat is [CPerBlock,BPerBlock + BGhostRead] #if 0 const auto blockwise_in_copy = Blockwise2dTensorCopy1{}; #elif 0 - const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = Blockwise2dTensorCopy3{}; #endif - // blockwise wei copy - // format is [CPerBlock*S*R,KPerBlock] +// blockwise wei copy +// format is [CPerBlock*S*R,KPerBlock] #if 0 const auto blockwise_wei_copy = Blockwise2dTensorCopy1{}; #elif 0 - const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = Blockwise2dTensorCopy3{}; #endif - // blockwise wei copy - // format is [CPerBlock*S*R,KPerBlock] +// blockwise wei copy +// format is [CPerBlock*S*R,KPerBlock] #if 0 const auto blockwise_wei_copy = Blockwise2dTensorCopy1{}; #endif - // weight: global mem to LDS, - // format is [S,R,CPerBlock,KPerBlock] +// weight: global mem to LDS, +// format is [S,R,CPerBlock,KPerBlock] #if 1 const auto blockwise_wei_copy = Blockwise4dTensorCopy1(in_transform_thread_block_desc, - p_in_transform_block + in_transform_block_desc.Get1dIndex( - n_thread_data_begin, - c_thread_data, - y_thread_data_begin * InTileSizeH, - x_thread_data_begin * InTileSizeW), - wei_transform_thread_block_desc, - p_wei_transform_block + wei_transform_block_desc.Get1dIndex( - k_thread_data_begin, c_thread_data, 0, 0), - out_transform_thread_desc, - p_out_transform_thread); + OutTileSizeW>( + in_transform_thread_block_desc, + p_in_transform_block + + in_transform_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + y_thread_data_begin * InTileSizeH, + x_thread_data_begin * InTileSizeW), + wei_transform_thread_block_desc, + p_wei_transform_block + + wei_transform_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_transform_thread_desc, + p_out_transform_thread); } }; diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp index 09ac224007..d9e5c9c7c1 100644 --- a/src/include/tensor.hpp +++ b/src/include/tensor.hpp @@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim) return os; } -typedef enum -{ +typedef enum { Half = 0, Float = 1, } DataType_t;