Commit "hip build" (mirror of https://github.com/ROCm/composable_kernel.git)
@@ -5,6 +5,7 @@ project(modular_convolution)
enable_language(CXX)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")

#boost
@@ -16,7 +17,7 @@ message("Boost_LIBRARY_DIRS: ${Boost_LIBRARY_DIRS}")
include_directories(BEFORE ${Boost_INCLUDE_DIRS})
link_directories(${Boost_LIBRARY_DIRS})

#openMP
#OpenMP
if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
find_package(OpenMP REQUIRED)

@@ -30,11 +31,20 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
link_libraries(${OpenMP_pthread_LIBRARY})
endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )

#cuda
enable_language(CUDA)
include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
#GPU backend
if(DEVICE_BACKEND STREQUAL "HIP")
set(DEVICE_BACKEND_HIP 1)

set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP REQUIRED)
elseif(DEVICE_BACKEND STREQUAL "CUDA")
set(DEVICE_BACKEND_CUDA 1)

enable_language(CUDA)
include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
endif()

#
include_directories(BEFORE src/include)
include_directories(BEFORE src/include ${PROJECT_BINARY_DIR}/src/include)
add_subdirectory(src)
add_subdirectory(driver)

build/cmake-hip.sh (new executable file, 16 lines)
@@ -0,0 +1,16 @@
#!/bin/bash

rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles

MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
MY_PROJECT_INSTALL=../install.dir

cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \
-D DEVICE_BACKEND="HIP" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
${MY_PROJECT_SOURCE}

@@ -1,2 +1,4 @@
add_executable(conv conv.cu)
target_link_libraries(conv tensor device)
set(DRIVER_SOURCE driver.cpp)

add_executable(driver ${DRIVER_SOURCE})
target_link_libraries(driver PRIVATE tensor)

@@ -54,39 +54,31 @@ void device_direct_convolution_1(InDesc,
(out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
(out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));

dim3 block_dim(BlockSize);
dim3 grid_dim(GridSize);

printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

for(unsigned i = 0; i < nrepeat; ++i)
{
const void* f = reinterpret_cast<const void*>(gridwise_direct_convolution_1<T,
InDesc,
WeiDesc,
OutDesc,
OutTileSizeH,
OutTileSizeW,
NPerBlock,
KPerBlock,
CPerBlock,
YPerBlock,
XPerBlock,
NPerThread,
KPerThread,
CPerThread,
BlockSize,
GridSize>);

T* in_dev_ptr = static_cast<T*>(in_device_buf.GetDeviceBuffer());
T* wei_dev_ptr = static_cast<T*>(wei_device_buf.GetDeviceBuffer());
T* out_dev_ptr = static_cast<T*>(out_device_buf.GetDeviceBuffer());

void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};

float time = 0;

launch_kernel(f, grid_dim, block_dim, args, time);
float time = launch_kernel(gridwise_direct_convolution_1<T,
InDesc,
WeiDesc,
OutDesc,
OutTileSizeH,
OutTileSizeW,
NPerBlock,
KPerBlock,
CPerBlock,
YPerBlock,
XPerBlock,
NPerThread,
KPerThread,
CPerThread,
BlockSize,
GridSize>,
dim3(GridSize),
dim3(BlockSize),
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_device_buf.GetDeviceBuffer()));

printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));

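Note: this same mechanical rewrite recurs in every device_* driver below. The old path type-erased the kernel into a const void*, packed argument addresses into a void*[] for a cudaLaunchKernel-style launcher, and returned the elapsed time through an out-parameter; the new variadic launch_kernel (defined in src/include/device.hpp later in this commit) takes the kernel and its arguments directly and returns the time. A condensed before/after sketch, with kern, in_ptr, wei_ptr, and out_ptr as placeholder names:

    // Old call shape: type-erased pointer plus an argument-address array.
    const void* f = reinterpret_cast<const void*>(kern);
    void* args[]  = {&in_ptr, &wei_ptr, &out_ptr};
    float time    = 0;
    launch_kernel(f, grid_dim, block_dim, args, time); // elapsed ms via out-param

    // New call shape: kernel and arguments forwarded directly, time returned.
    float time = launch_kernel(kern, dim3(GridSize), dim3(BlockSize), in_ptr, wei_ptr, out_ptr);
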
@@ -69,39 +69,31 @@ void device_direct_convolution_2(InDesc,
(out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
(out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));

dim3 block_dim(BlockSize);
dim3 grid_dim(GridSize);

printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

for(unsigned i = 0; i < nrepeat; ++i)
{
const void* f = reinterpret_cast<const void*>(gridwise_direct_convolution_2<T,
InDesc,
WeiDesc,
OutDesc,
OutTileSizeH,
OutTileSizeW,
NPerBlock,
KPerBlock,
CPerBlock,
YPerBlock,
XPerBlock,
NPerThread,
KPerThread,
CPerThread,
BlockSize,
GridSize>);

T* in_dev_ptr = static_cast<T*>(in_device_buf.GetDeviceBuffer());
T* wei_dev_ptr = static_cast<T*>(wei_device_buf.GetDeviceBuffer());
T* out_dev_ptr = static_cast<T*>(out_device_buf.GetDeviceBuffer());

void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};

float time = 0;

launch_kernel(f, grid_dim, block_dim, args, time);
float time = launch_kernel(gridwise_direct_convolution_2<T,
InDesc,
WeiDesc,
OutDesc,
OutTileSizeH,
OutTileSizeW,
NPerBlock,
KPerBlock,
CPerBlock,
YPerBlock,
XPerBlock,
NPerThread,
KPerThread,
CPerThread,
BlockSize,
GridSize>,
dim3(GridSize),
dim3(BlockSize),
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_device_buf.GetDeviceBuffer()));

printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));

@@ -194,14 +194,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);

dim3 block_dim(BlockSize);
dim3 grid_dim(GridSize);

printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

for(unsigned i = 0; i < nrepeat; ++i)
{
const void* f = reinterpret_cast<const void*>(
float time = launch_kernel(
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn<GridSize,
BlockSize,
T,
@@ -221,17 +218,12 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead>);

T* in_dev_ptr = static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer());
T* wei_dev_ptr = static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer());
T* out_dev_ptr = static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer());

void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};

float time = 0;

launch_kernel(f, grid_dim, block_dim, args, time);
WeiBlockCopyDataPerRead>,
dim3(GridSize),
dim3(BlockSize),
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));

printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));

@@ -94,7 +94,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
constexpr unsigned WeiBlockCopyThreadPerDim1 = 1;

constexpr unsigned BlockSize = 8;
#elif 0
#elif 1
// for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256
constexpr unsigned NPerBlock = 16;
constexpr unsigned KPerBlock = 64;
@@ -246,14 +246,11 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);

dim3 block_dim(BlockSize);
dim3 grid_dim(GridSize);

printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

for(unsigned i = 0; i < nrepeat; ++i)
{
const void* f = reinterpret_cast<const void*>(
float time = launch_kernel(
#if 0
gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
#elif 1
@@ -278,17 +275,13 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
HoPerThread,
WoPerThread,
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>);
WeiBlockCopyThreadPerDim1>,
dim3(GridSize),
dim3(BlockSize),

T* in_dev_ptr = static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer());
T* wei_dev_ptr = static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer());
T* out_dev_ptr = static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer());

void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};

float time = 0;

launch_kernel(f, grid_dim, block_dim, args, time);
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));

printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));

@@ -52,39 +52,30 @@ void device_implicit_gemm_convolution_1_nchw_kcsr_nkhw(InDesc,
(out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
(out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);

dim3 block_dim(BlockSize);
dim3 grid_dim(GridSize);

printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

for(unsigned i = 0; i < nrepeat; ++i)
{
const void* f = reinterpret_cast<const void*>(
gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw<GridSize,
BlockSize,
T,
InDesc,
WeiDesc,
OutDesc,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread>);

T* in_dev_ptr = static_cast<T*>(in_device_buf.GetDeviceBuffer());
T* wei_dev_ptr = static_cast<T*>(wei_device_buf.GetDeviceBuffer());
T* out_dev_ptr = static_cast<T*>(out_device_buf.GetDeviceBuffer());

void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};

float time = 0;

launch_kernel(f, grid_dim, block_dim, args, time);
float time = launch_kernel(gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw<GridSize,
BlockSize,
T,
InDesc,
WeiDesc,
OutDesc,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread>,
dim3(GridSize),
dim3(BlockSize),
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_device_buf.GetDeviceBuffer()));

printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));

@@ -104,14 +104,11 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);

dim3 block_dim(BlockSize);
dim3 grid_dim(GridSize);

printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

for(unsigned i = 0; i < nrepeat; ++i)
{
const void* f = reinterpret_cast<const void*>(
float time = launch_kernel(
gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw<GridSize,
BlockSize,
T,
@@ -127,17 +124,12 @@ void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread>);

T* in_dev_ptr = static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer());
T* wei_dev_ptr = static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer());
T* out_dev_ptr = static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer());

void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};

float time = 0;

launch_kernel(f, grid_dim, block_dim, args, time);
WoPerThread>,
dim3(GridSize),
dim3(BlockSize),
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));

@@ -195,9 +195,6 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
constexpr unsigned GridSize =
((N * Hi * Wi + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

dim3 block_dim(BlockSize);
dim3 grid_dim(GridSize);

printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

// mem
@@ -213,7 +210,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,

for(unsigned i = 0; i < nrepeat; ++i)
{
const void* f = reinterpret_cast<const void*>(
float time = launch_kernel(
#if 0
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw
#else
@@ -244,17 +241,12 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead>);

T* in_dev_ptr = static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer());
T* wei_dev_ptr = static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer());
T* out_dev_ptr = static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer());

void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};

float time;

launch_kernel(f, grid_dim, block_dim, args, time);
WeiBlockCopyDataPerRead>,
dim3(GridSize),
dim3(BlockSize),
static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer()));

printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));

@@ -123,9 +123,6 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
constexpr unsigned GridSize =
((N * Hi * Wi + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

dim3 block_dim(BlockSize);
dim3 grid_dim(GridSize);

printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

// mem
@@ -141,7 +138,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,

for(unsigned i = 0; i < nrepeat; ++i)
{
const void* f = reinterpret_cast<const void*>(
float time = launch_kernel(
#if 1
gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw
#else
@@ -162,17 +159,13 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster,
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>);
InBlockCopyThreadPerDim1>,
dim3(GridSize),
dim3(BlockSize),

T* in_dev_ptr = static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer());
T* wei_dev_ptr = static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer());
T* out_dev_ptr = static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer());

void* args[] = {&in_dev_ptr, &wei_dev_ptr, &out_dev_ptr};

float time = 0;

launch_kernel(f, grid_dim, block_dim, args, time);
static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer()));

printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));

@@ -2,6 +2,7 @@
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "config.h"
#include "tensor.hpp"
#include "ConstantTensorDescriptor.cuh"
#include "conv_common.cuh"
@@ -49,7 +50,7 @@ struct GeneratorTensor_3
std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};
std::vector<std::size_t> lens(sizeof...(Is), 100);
std::vector<std::size_t> strides(sizeof...(Is), 1);
std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1);
std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is)-1), strides.rbegin() + 1);
return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1;
#endif
}
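
For orientation (not part of the commit): with rank 3 and lens = {100, 100, 100}, the partial_sum above fills strides with {200, 100, 1}, so GeneratorTensor_3 returns 200*i0 + 100*i1 + i2 + 1 for index (i0, i1, i2). Since partial_sum accumulates with +, these are hash-like coefficients rather than true row-major strides (a partial product would give 10000, 100, 1). A minimal standalone check:

    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main()
    {
        std::vector<std::size_t> lens(3, 100);
        std::vector<std::size_t> strides(3, 1);
        // mirrors the call above for sizeof...(Is) == 3
        std::partial_sum(lens.rbegin(), lens.rbegin() + 2, strides.rbegin() + 1);
        std::printf("%zu %zu %zu\n", strides[0], strides[1], strides[2]); // prints: 200 100 1
    }
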
@@ -339,7 +340,7 @@ void host_winograd_3x3_convolution(
std::size_t ho = OutTileSizeH * y + j;
for(int i = 0; i < OutTileSizeW; ++i)
{
std::size_t wo = OutTileSizeW * x + i;
out(n, k, ho, wo) = out_hold(n, k, y, x, j, i);
}
}
@@ -392,13 +393,13 @@ int main()
constexpr unsigned WPad = 0;
#elif 0
// 3x3, 34x34
constexpr unsigned N = 64;
constexpr unsigned C = 256;
constexpr unsigned HI = 34;
constexpr unsigned WI = 34;
constexpr unsigned K = 64;
constexpr unsigned S = 3;
constexpr unsigned R = 3;

constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0;
@@ -601,7 +602,7 @@ int main()
#endif
(in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);

#elif 0
#elif 1
device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(in_nchw_desc,
in_nchw,
wei_kcsr_desc,

@@ -1,21 +1,17 @@
configure_file("${PROJECT_SOURCE_DIR}/src/include/config.h.in" "${PROJECT_BINARY_DIR}/src/include/config.h")

set(TENSOR_SOURCE
tensor.cpp;
device.cpp;
)

add_library(tensor SHARED ${TENSOR_SOURCE})
set_target_properties(tensor PROPERTIES PREFIX "")
target_compile_features(tensor PUBLIC)
set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)

if(DEVICE_BACKEND STREQUAL "CUDA")
target_link_libraries(device nvToolsExt cudart)
endif()

install(TARGETS tensor LIBRARY DESTINATION lib)

set(DEVICE_SOURCE
device.cu;
)

add_library(device SHARED ${DEVICE_SOURCE})
set_target_properties(device PROPERTIES PREFIX "")
target_compile_features(device PUBLIC)
set_target_properties(device PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device LIBRARY DESTINATION lib)
target_link_libraries(device nvToolsExt cudart)

src/device.cpp (new file, 114 lines)
@@ -0,0 +1,114 @@
#include "config.h"
|
||||
#include "device.hpp"
|
||||
|
||||
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
checkCudaErrors(cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
|
||||
#endif
|
||||
}
|
||||
|
||||
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
|
||||
|
||||
void DeviceMem::ToDevice(const void* p)
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
hipGetErrorString(
|
||||
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
checkCudaErrors(
|
||||
cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
|
||||
#endif
|
||||
}
|
||||
|
||||
void DeviceMem::FromDevice(void* p)
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
|
||||
#endif
|
||||
}
|
||||
|
||||
DeviceMem::~DeviceMem()
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
hipGetErrorString(hipFree(mpDeviceBuf));
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
checkCudaErrors(cudaFree(mpDeviceBuf));
|
||||
#endif
|
||||
}
|
||||
|
||||
struct KernelTimerImpl
|
||||
{
|
||||
KernelTimerImpl()
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
hipEventCreate(&mStart);
|
||||
hipEventCreate(&mEnd);
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
cudaEventCreate(&mStart);
|
||||
cudaEventCreate(&mEnd);
|
||||
#endif
|
||||
}
|
||||
|
||||
~KernelTimerImpl()
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
hipEventDestroy(mStart);
|
||||
hipEventDestroy(mEnd);
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
cudaEventDestroy(mStart);
|
||||
cudaEventDestroy(mEnd);
|
||||
#endif
|
||||
}
|
||||
|
||||
void Start()
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
hipEventRecord(mStart, 0);
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
cudaEventRecord(mStart, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
void End()
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
hipEventRecord(mEnd, 0);
|
||||
hipEventSynchronize(mEnd);
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
cudaEventRecord(mEnd, 0);
|
||||
cudaEventSynchronize(mEnd);
|
||||
#endif
|
||||
}
|
||||
|
||||
float GetElapsedTime() const
|
||||
{
|
||||
float time;
|
||||
#if DEVICE_BACKEND_HIP
|
||||
hipEventElapsedTime(&time, mStart, mEnd);
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
cudaEventElapsedTime(&time, mStart, mEnd);
|
||||
#endif
|
||||
return time;
|
||||
}
|
||||
|
||||
#if DEVICE_BACKEND_HIP
|
||||
hipEvent_t mStart, mEnd;
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
cudaEvent_t mStart, mEnd;
|
||||
#endif
|
||||
};
|
||||
|
||||
KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {}
|
||||
|
||||
KernelTimer::~KernelTimer() {}
|
||||
|
||||
void KernelTimer::Start() { impl->Start(); }
|
||||
|
||||
void KernelTimer::End() { impl->End(); }
|
||||
|
||||
float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }
|
||||
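
The timer keeps its backend-specific event handles behind the KernelTimerImpl pimpl, so callers only see the small KernelTimer class declared in device.hpp. A hypothetical caller on the HIP backend (my_kernel, grid_size, block_size, and the kernel arguments are placeholders):

    KernelTimer timer;
    timer.Start(); // hipEventRecord(mStart, 0)
    hipLaunchKernelGGL(my_kernel, dim3(grid_size), dim3(block_size), 0, 0, p_data, n);
    timer.End();   // hipEventRecord(mEnd, 0) + hipEventSynchronize(mEnd)
    float elapsed_ms = timer.GetElapsedTime(); // hipEventElapsedTime(&time, mStart, mEnd)
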
src/device.cu (deleted file)
@@ -1,79 +0,0 @@
#include "device.hpp"
#include "cuda_runtime.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"

DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
checkCudaErrors(cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}

void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }

void DeviceMem::ToDevice(const void* p)
{
checkCudaErrors(
cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
}

void DeviceMem::FromDevice(void* p)
{
checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
}

DeviceMem::~DeviceMem() { checkCudaErrors(cudaFree(mpDeviceBuf)); }

struct KernelTimerImpl
{
KernelTimerImpl()
{
cudaEventCreate(&mStart);
cudaEventCreate(&mEnd);
}

~KernelTimerImpl()
{
cudaEventDestroy(mStart);
cudaEventDestroy(mEnd);
}

void Start() { cudaEventRecord(mStart, 0); }

void End()
{
cudaEventRecord(mEnd, 0);
cudaEventSynchronize(mEnd);
}

float GetElapsedTime() const
{
float time;
cudaEventElapsedTime(&time, mStart, mEnd);
return time;
}

cudaEvent_t mStart, mEnd;
};

KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {}

KernelTimer::~KernelTimer() {}

void KernelTimer::Start() { impl->Start(); }

void KernelTimer::End() { impl->End(); }

float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }

void launch_kernel(const void* func, dim3 grid_dim, dim3 block_dim, void** args, float& time)
{
KernelTimer timer;
timer.Start();

cudaError_t error = cudaLaunchKernel(func, grid_dim, block_dim, args, 0, 0);

timer.End();
time = timer.GetElapsedTime();

checkCudaErrors(error);
}

@@ -245,10 +245,11 @@ struct BlockwiseChwnTensorCopyPadded
constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;

const Float* p_src_tmp =
p_src + src_desc.Get1dIndex(c_block_data_begin,
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
n_block_data_begin);
p_src +
src_desc.Get1dIndex(c_block_data_begin,
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
n_block_data_begin);

#if 0
if(get_thread_local_1d_id() == 0)

@@ -95,10 +95,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
Float p_out_thread[out_thread_desc.GetElementSpace()];

threadwise_4d_tensor_copy(out_block_desc,
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
p_out_block +
out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
out_thread_desc,
p_out_thread,
out_thread_desc.GetLengths());
@@ -109,10 +110,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
// threadwise convolution
threadwise_direct_convolution_2(
in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data_begin,
hi_thread_data_begin,
wi_thread_data_begin),
p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data_begin,
hi_thread_data_begin,
wi_thread_data_begin),
wei_thread_block_desc,
p_wei_block +
wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
@@ -124,10 +126,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
threadwise_4d_tensor_copy(out_thread_desc,
p_out_thread,
out_block_desc,
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
p_out_block +
out_block_desc.Get1dIndex(n_thread_data_begin,
k_thread_data_begin,
ho_thread_data_begin,
wo_thread_data_begin),
out_thread_desc.GetLengths());
}
}

@@ -305,8 +305,9 @@ struct BlockwiseGemmBlockABlockBThreadC
constexpr unsigned NClusterWork =
(NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);

static_assert(BlockSize == (MClusterWork * MThreadPerCluster) *
(NClusterWork * NThreadPerCluster),
static_assert(BlockSize ==
(MClusterWork * MThreadPerCluster) *
(NClusterWork * NThreadPerCluster),
"wrong! wrong BlockSize");

if(DistributeThreadAlongColumnFirst)
@@ -685,7 +686,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;

// preload A, B
#pragma unroll
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{ // copy A-sub to form A
@@ -718,7 +719,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
FloatA* p_a_thread_next = even_loop ? p_a_thread_1 : p_a_thread_0;
FloatB* p_b_thread_next = even_loop ? p_b_thread_1 : p_b_thread_0;

// preload next A, B
#pragma unroll
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{ // copy A-sub to form A
@@ -906,8 +907,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
c_thread_sub_mtx,
False,
p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
n_repeat * NPerThreadSubC),
p_c_thread +
c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
n_repeat * NPerThreadSubC),
f_accum);
}
}

@@ -62,4 +62,4 @@ struct Sequence
printf("Sequence::ReorderByPutOldToNew not implemented");
assert(false);
}
};

src/include/config.h.in (new file, 11 lines)
@@ -0,0 +1,11 @@
#pragma once
#cmakedefine01 DEVICE_BACKEND_HIP
#cmakedefine01 DEVICE_BACKEND_CUDA

#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#endif
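
#cmakedefine01 NAME expands to #define NAME 1 when the corresponding CMake variable is true at configure_file() time, and #define NAME 0 otherwise, which is why the sources test these macros with #if rather than #ifdef. For the HIP configuration set up in CMakeLists.txt above, the generated config.h would read:

    /* config.h as generated for a HIP build (illustrative) */
    #pragma once
    #define DEVICE_BACKEND_HIP 1
    #define DEVICE_BACKEND_CUDA 0

    #if DEVICE_BACKEND_HIP
    #include "hip/hip_runtime.h"
    #elif DEVICE_BACKEND_CUDA
    #include "cuda_runtime.h"
    #include "nvToolsExt.h"
    #include "helper_cuda.h"
    #endif
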
@@ -1,5 +1,6 @@
#pragma once
#include <memory>
#include "config.h"

struct DeviceMem
{
@@ -27,4 +28,31 @@ struct KernelTimer
std::unique_ptr<KernelTimerImpl> impl;
};

void launch_kernel(const void* func, dim3 grid_dim, dim3 block_dim, void** args, float& time);
template <typename... Args, typename F>
float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args)
{
KernelTimer timer;

#if DEVICE_BACKEND_HIP
timer.Start();

hipLaunchKernelGGL(kernel, grid_dim, block_dim, 0, 0, args...);

timer.End();

hipGetErrorString(hipGetLastError());
#elif DEVICE_BACKEND_CUDA
const void* f = reinterpret_cast<const void*>(kernel);
void* p_args[] = {&args...};

timer.Start();

cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, 0, 0);

timer.End();

checkCudaErrors(error);
#endif

return timer.GetElapsedTime();
}
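
A minimal sketch of driving the new interface with an ordinary kernel; saxpy, n, x_buf, and y_buf are stand-ins, not part of the commit:

    __global__ void saxpy(float* y, const float* x, float a, unsigned n)
    {
        unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
        if(i < n)
            y[i] = a * x[i] + y[i];
    }

    // ...
    float ms = launch_kernel(saxpy,
                             dim3((n + 255) / 256), // grid
                             dim3(256),             // block
                             static_cast<float*>(y_buf.GetDeviceBuffer()),
                             static_cast<const float*>(x_buf.GetDeviceBuffer()),
                             2.0f,
                             n);
    printf("saxpy: %f ms\n", ms);

On the HIP path the arguments are forwarded to hipLaunchKernelGGL; on the CUDA path their addresses are collected into a void*[] for cudaLaunchKernel, so arguments must be passed by value exactly as the kernel declares them.
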
@@ -1,4 +1,5 @@
#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh"
@@ -146,10 +147,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
c_block_work_begin += CPerBlock)
{
// copy input tensor to LDS
blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
hi_block_work_begin,
wi_block_work_begin),
blockwise_in_copy.Run(p_in_global +
in_global_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin,
hi_block_work_begin,
wi_block_work_begin),
p_in_block);

// copy weight tensor to LDS
@@ -176,9 +178,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
}

// copy output tensor from LDS to device mem
blockwise_out_copy.Run(p_out_block,
p_out_global + out_global_desc.Get1dIndex(n_block_work_begin,
k_block_work_begin,
ho_block_work_begin,
wo_block_work_begin));
blockwise_out_copy.Run(
p_out_block,
p_out_global +
out_global_desc.Get1dIndex(
n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin));
}

@@ -1,4 +1,5 @@
#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh"
@@ -162,10 +163,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
c_block_data_begin += CPerBlock, __syncthreads())
{
// copy input tensor to LDS
blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
blockwise_in_copy.Run(p_in_global +
in_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
p_in_block);

// copy weight tensor to LDS
@@ -177,14 +179,15 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_

for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
{
// threadwise convolution
#if 1
threadwise_direct_convolution_2(
in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_thread_block_desc,
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_thread_desc,
@@ -192,10 +195,11 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
#elif 0
threadwise_direct_convolution_3(
in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_thread_block_desc,
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_thread_desc,
@@ -209,9 +213,10 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
out_thread_desc,
p_out_thread,
out_global_desc,
p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
p_out_global +
out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_thread_desc.GetLengths());
}

@@ -121,8 +121,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
decltype(in_chwn_block_desc),
decltype(in_chwn_block_desc.GetLengths())>{};

// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
@@ -199,8 +199,9 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);

const Float* p_in_global_block_begin =
p_in_global + in_chwn_global_desc.Get1dIndex(
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
p_in_global +
in_chwn_global_desc.Get1dIndex(
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);

const Float* p_wei_global_block_begin =
p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
@@ -257,10 +258,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
out_hkwn_thread_desc,
p_out_thread,
out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
p_out_global +
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_khwn_from_hkwn);
}

@@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
out_hkwn_thread_desc,
p_out_thread,
out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
p_out_global +
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_khwn_from_hkwn);
}

@@ -256,7 +256,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0;
Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0;

// preload next data
#if 1
// input: global mem to LDS,
blockwise_in_copy.Run(p_in_global,
@@ -339,10 +339,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
out_hkwn_thread_desc,
p_out_thread,
out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
p_out_global +
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_khwn_from_hkwn);
}

@@ -160,10 +160,11 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
in_nchw_global_desc,
p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
p_in_global +
in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
in_chwn_block_desc,
p_in_block,
in_nchw_block_desc.GetLengths(),
@@ -244,10 +245,11 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_hkwn_thread_desc,
p_out_thread,
out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_nkhw_from_hkwn);
#else
@@ -261,10 +263,11 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_nkhw_thread_desc,
p_out_thread,
out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_nkhw_thread_desc.GetLengths());
#endif
}

@@ -166,10 +166,11 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
in_nchw_global_desc,
p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
p_in_global +
in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
in_chwn_block_desc,
p_in_block,
in_nchw_block_desc.GetLengths(),
@@ -179,9 +180,10 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
#if 1
// weight: global mem to LDS,
// format is [S,R,C,K], no conversion needed
blockwise_wei_copy.Run(p_wei_global + wei_srck_global_desc.Get1dIndex(
0, 0, c_block_data_begin, k_block_data_begin),
p_wei_block);
blockwise_wei_copy.Run(
p_wei_global +
wei_srck_global_desc.Get1dIndex(0, 0, c_block_data_begin, k_block_data_begin),
p_wei_block);
#endif

__syncthreads();
@@ -217,10 +219,11 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
out_hkwn_thread_desc,
p_out_thread,
out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_nkhw_from_hkwn);
}

@@ -111,8 +111,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
}
#endif

// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
@@ -121,7 +121,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
@@ -129,7 +129,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>{};
#elif 1
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
@@ -137,8 +137,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
InBlockCopyDataPerRead>{};
#endif

// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,

@@ -111,8 +111,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
}
#endif

// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
@@ -121,7 +121,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
@@ -129,7 +129,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>{};
#elif 1
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
@@ -137,8 +137,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
InBlockCopyDataPerRead>{};
#endif

// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
@@ -258,7 +258,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b

__syncthreads();

// load next data
#if 0
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next);

@@ -103,8 +103,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(const Float* const __restric
}
#endif

// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,

@@ -103,8 +103,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
}
#endif

// in: global mem to LDS
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 1
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
@@ -129,8 +129,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
decltype(in_cb_block_desc.GetLengths())>{};
#endif

// weight: global mem to LDS,
// format is [S,R,CPerBlock,KPerBlock]
#if 1
const auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize,
@@ -191,7 +191,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
// set threadwise output tensor to 0
threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);

// prologue: load data
#if 1
// input: global mem to LDS,
blockwise_in_copy.Run(p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin),
@@ -220,9 +220,10 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
#if 1
// preload next data
// input: global mem to LDS,
blockwise_in_copy.Run(p_in_global + in_cb_global_desc.Get1dIndex(
c_block_data_begin + CPerBlock, b_block_data_begin),
p_in_block_next);
blockwise_in_copy.Run(
p_in_global +
in_cb_global_desc.Get1dIndex(c_block_data_begin + CPerBlock, b_block_data_begin),
p_in_block_next);
#endif

#if 1

@@ -189,17 +189,18 @@ __global__ void gridwise_winograd_convolution(const Float* const __restrict__ p_
S,
R,
OutTileSizeH,
OutTileSizeW>(in_transform_thread_block_desc,
p_in_transform_block + in_transform_block_desc.Get1dIndex(
n_thread_data_begin,
c_thread_data,
y_thread_data_begin * InTileSizeH,
x_thread_data_begin * InTileSizeW),
wei_transform_thread_block_desc,
p_wei_transform_block + wei_transform_block_desc.Get1dIndex(
k_thread_data_begin, c_thread_data, 0, 0),
out_transform_thread_desc,
p_out_transform_thread);
OutTileSizeW>(
in_transform_thread_block_desc,
p_in_transform_block +
in_transform_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
y_thread_data_begin * InTileSizeH,
x_thread_data_begin * InTileSizeW),
wei_transform_thread_block_desc,
p_wei_transform_block +
wei_transform_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_transform_thread_desc,
p_out_transform_thread);
}
};

@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
return os;
}

typedef enum
{
typedef enum {
Half = 0,
Float = 1,
} DataType_t;