From b2888adfbe103ae3d9006af87d5871b69cbf00ba Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 15 Feb 2019 02:13:21 -0600 Subject: [PATCH] change file extension to hip.hpp and hip.cpp --- ..._1.cuh => device_direct_convolution_1.hpp} | 2 +- ..._2.cuh => device_direct_convolution_2.hpp} | 2 +- ...cit_gemm_convolution_1_chwn_csrk_khwn.hpp} | 2 +- ...m_convolution_1_chwn_csrk_khwn_padded.hpp} | 4 +- ...cit_gemm_convolution_1_nchw_kcsr_nkhw.hpp} | 2 +- ...cit_gemm_convolution_1_nchw_srck_nkhw.hpp} | 2 +- ...cit_gemm_convolution_2_cnhw_csrk_knhw.hpp} | 4 +- ...cit_gemm_convolution_2_cnhw_srck_knhw.hpp} | 4 +- driver/driver.hip.cpp | 22 ++++----- ...r.cuh => ConstantMatrixDescriptor.hip.hpp} | 2 +- ...r.cuh => ConstantTensorDescriptor.hip.hpp} | 2 +- ..._op.cuh => blockwise_2d_tensor_op.hip.hpp} | 2 +- ..._op.cuh => blockwise_4d_tensor_op.hip.hpp} | 11 ++--- ...h => blockwise_direct_convolution.hip.hpp} | 33 ++++++------- ...ckwise_gemm.cuh => blockwise_gemm.hip.hpp} | 12 ++--- src/include/{common.cuh => common.hip.hpp} | 0 .../{conv_common.cuh => conv_common.hip.hpp} | 2 +- src/include/device.hpp | 4 +- ... => gridwise_direct_convolution_1.hip.hpp} | 27 +++++------ ... => gridwise_direct_convolution_2.hip.hpp} | 48 +++++++++---------- ...gemm_convolution_1_chwn_csrk_khwn.hip.hpp} | 28 +++++------ ...nvolution_1_chwn_csrk_khwn_padded.hip.hpp} | 23 +++++---- ...hwn_csrk_khwn_padded_lds_pipeline.hip.hpp} | 23 +++++---- ...gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp} | 39 +++++++-------- ...gemm_convolution_1_nchw_srck_nkhw.hip.hpp} | 37 +++++++------- ...gemm_convolution_2_cnhw_csrk_knhw.hip.hpp} | 18 +++---- ..._cnhw_csrk_knhw_lds_double_buffer.hip.hpp} | 18 +++---- ...gemm_convolution_2_cnhw_srck_knhw.hip.hpp} | 14 +++--- ...ion_2_cnhw_srck_knhw_lds_pipeline.hip.hpp} | 21 ++++---- ... => gridwise_winograd_convolution.hip.hpp} | 29 ++++++----- src/include/tensor.hpp | 3 +- ...op.cuh => threadwise_2d_tensor_op.hip.hpp} | 2 +- ...op.cuh => threadwise_4d_tensor_op.hip.hpp} | 2 +- ... => threadwise_direct_convolution.hip.hpp} | 2 +- ...dwise_gemm.cuh => threadwise_gemm.hip.hpp} | 0 35 files changed, 212 insertions(+), 234 deletions(-) rename driver/{device_direct_convolution_1.cuh => device_direct_convolution_1.hpp} (98%) rename driver/{device_direct_convolution_2.cuh => device_direct_convolution_2.hpp} (98%) rename driver/{device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh => device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp} (99%) rename driver/{device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh => device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hpp} (99%) rename driver/{device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh => device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hpp} (98%) rename driver/{device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh => device_implicit_gemm_convolution_1_nchw_srck_nkhw.hpp} (98%) rename driver/{device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh => device_implicit_gemm_convolution_2_cnhw_csrk_knhw.hpp} (99%) rename driver/{device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh => device_implicit_gemm_convolution_2_cnhw_srck_knhw.hpp} (98%) rename src/include/{ConstantMatrixDescriptor.cuh => ConstantMatrixDescriptor.hip.hpp} (98%) rename src/include/{ConstantTensorDescriptor.cuh => ConstantTensorDescriptor.hip.hpp} (99%) rename src/include/{blockwise_2d_tensor_op.cuh => blockwise_2d_tensor_op.hip.hpp} (99%) rename src/include/{blockwise_4d_tensor_op.cuh => blockwise_4d_tensor_op.hip.hpp} (96%) rename src/include/{blockwise_direct_convolution.cuh => blockwise_direct_convolution.hip.hpp} (87%) rename src/include/{blockwise_gemm.cuh => blockwise_gemm.hip.hpp} (98%) rename src/include/{common.cuh => common.hip.hpp} (100%) rename src/include/{conv_common.cuh => conv_common.hip.hpp} (98%) rename src/include/{gridwise_direct_convolution_1.cuh => gridwise_direct_convolution_1.hip.hpp} (89%) rename src/include/{gridwise_direct_convolution_2.cuh => gridwise_direct_convolution_2.hip.hpp} (83%) rename src/include/{gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh => gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp} (93%) rename src/include/{gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh => gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp} (94%) rename src/include/{gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh => gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp} (95%) rename src/include/{gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh => gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp} (89%) rename src/include/{gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh => gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp} (88%) rename src/include/{gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh => gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp} (97%) rename src/include/{gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh => gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp} (97%) rename src/include/{gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh => gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.hip.hpp} (97%) rename src/include/{gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh => gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.hip.hpp} (96%) rename src/include/{gridwise_winograd_convolution.cuh => gridwise_winograd_convolution.hip.hpp} (90%) rename src/include/{threadwise_2d_tensor_op.cuh => threadwise_2d_tensor_op.hip.hpp} (99%) rename src/include/{threadwise_4d_tensor_op.cuh => threadwise_4d_tensor_op.hip.hpp} (99%) rename src/include/{threadwise_direct_convolution.cuh => threadwise_direct_convolution.hip.hpp} (99%) rename src/include/{threadwise_gemm.cuh => threadwise_gemm.hip.hpp} (100%) diff --git a/driver/device_direct_convolution_1.cuh b/driver/device_direct_convolution_1.hpp similarity index 98% rename from driver/device_direct_convolution_1.cuh rename to driver/device_direct_convolution_1.hpp index 9a3e9c32d2..99b184ec7c 100644 --- a/driver/device_direct_convolution_1.cuh +++ b/driver/device_direct_convolution_1.hpp @@ -1,7 +1,7 @@ #pragma once #include #include "device.hpp" -#include "gridwise_direct_convolution_1.cuh" +#include "gridwise_direct_convolution_1.hip.hpp" template void device_direct_convolution_1(InDesc, diff --git a/driver/device_direct_convolution_2.cuh b/driver/device_direct_convolution_2.hpp similarity index 98% rename from driver/device_direct_convolution_2.cuh rename to driver/device_direct_convolution_2.hpp index cfc88e45c7..f627719026 100644 --- a/driver/device_direct_convolution_2.cuh +++ b/driver/device_direct_convolution_2.hpp @@ -1,7 +1,7 @@ #pragma once #include #include "device.hpp" -#include "gridwise_direct_convolution_2.cuh" +#include "gridwise_direct_convolution_2.hip.hpp" template void device_direct_convolution_2(InDesc, diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp similarity index 99% rename from driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh rename to driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp index fb36282dbc..fb0ae4a8cd 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp @@ -1,7 +1,7 @@ #pragma once #include #include "device.hpp" -#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh" +#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp" template void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hpp similarity index 99% rename from driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh rename to driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hpp index f755c36e94..0c38e5206f 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hpp @@ -1,8 +1,8 @@ #pragma once #include #include "device.hpp" -#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh" -#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh" +#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp" +#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp" template void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc, diff --git a/driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh b/driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hpp similarity index 98% rename from driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh rename to driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hpp index e78f3b0660..ea7dbea266 100644 --- a/driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh +++ b/driver/device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hpp @@ -1,7 +1,7 @@ #pragma once #include #include "device.hpp" -#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh" +#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp" template void device_implicit_gemm_convolution_1_nchw_kcsr_nkhw(InDesc, diff --git a/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh b/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.hpp similarity index 98% rename from driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh rename to driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.hpp index 7f4f139d39..166d392e5f 100644 --- a/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh +++ b/driver/device_implicit_gemm_convolution_1_nchw_srck_nkhw.hpp @@ -1,7 +1,7 @@ #pragma once #include #include "device.hpp" -#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh" +#include "gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp" template void device_implicit_gemm_convolution_1_nchw_srck_nkhw(InDesc, diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.hpp similarity index 99% rename from driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh rename to driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.hpp index 34f4745501..870d808bc9 100644 --- a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh +++ b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.hpp @@ -1,8 +1,8 @@ #pragma once #include #include "device.hpp" -#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh" -#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh" +#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp" +#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp" template void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.hpp similarity index 98% rename from driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh rename to driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.hpp index d0bc18dbba..2ede247ff0 100644 --- a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh +++ b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.hpp @@ -1,8 +1,8 @@ #pragma once #include #include "device.hpp" -#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh" -#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh" +#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.hip.hpp" +#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.hip.hpp" template void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 8b5a71a7e5..7b669f47d0 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -4,17 +4,17 @@ #include #include "config.h" #include "tensor.hpp" -#include "ConstantTensorDescriptor.cuh" -#include "conv_common.cuh" -#include "device_direct_convolution_1.cuh" -#include "device_direct_convolution_2.cuh" -#include "device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh" -#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh" -#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh" -#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh" -#include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh" -#include "device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh" -//#include "device_winograd_convolution.cuh" +#include "ConstantTensorDescriptor.hip.hpp" +#include "conv_common.hip.hpp" +#include "device_direct_convolution_1.hpp" +#include "device_direct_convolution_2.hpp" +#include "device_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hpp" +#include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.hpp" +#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp" +#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hpp" +#include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.hpp" +#include "device_implicit_gemm_convolution_2_cnhw_csrk_knhw.hpp" +//#include "device_winograd_convolution.hip.hpp" struct GeneratorTensor_1 { diff --git a/src/include/ConstantMatrixDescriptor.cuh b/src/include/ConstantMatrixDescriptor.hip.hpp similarity index 98% rename from src/include/ConstantMatrixDescriptor.cuh rename to src/include/ConstantMatrixDescriptor.hip.hpp index bf141eb4f0..d014e93574 100644 --- a/src/include/ConstantMatrixDescriptor.cuh +++ b/src/include/ConstantMatrixDescriptor.hip.hpp @@ -1,5 +1,5 @@ #pragma once -#include "common.cuh" +#include "common.hip.hpp" template struct ConstantMatrixDescriptor diff --git a/src/include/ConstantTensorDescriptor.cuh b/src/include/ConstantTensorDescriptor.hip.hpp similarity index 99% rename from src/include/ConstantTensorDescriptor.cuh rename to src/include/ConstantTensorDescriptor.hip.hpp index 90bf761a94..c3157653d2 100644 --- a/src/include/ConstantTensorDescriptor.cuh +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -1,5 +1,5 @@ #pragma once -#include "common.cuh" +#include "common.hip.hpp" // this is ugly, only for 2d template diff --git a/src/include/blockwise_2d_tensor_op.cuh b/src/include/blockwise_2d_tensor_op.hip.hpp similarity index 99% rename from src/include/blockwise_2d_tensor_op.cuh rename to src/include/blockwise_2d_tensor_op.hip.hpp index d1463bd807..a90007e246 100644 --- a/src/include/blockwise_2d_tensor_op.cuh +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -1,5 +1,5 @@ #pragma once -#include "ConstantTensorDescriptor.cuh" +#include "ConstantTensorDescriptor.hip.hpp" template __device__ void diff --git a/src/include/blockwise_4d_tensor_op.cuh b/src/include/blockwise_4d_tensor_op.hip.hpp similarity index 96% rename from src/include/blockwise_4d_tensor_op.cuh rename to src/include/blockwise_4d_tensor_op.hip.hpp index 693a0e4abe..b81063fed5 100644 --- a/src/include/blockwise_4d_tensor_op.cuh +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -1,5 +1,5 @@ #pragma once -#include "ConstantTensorDescriptor.cuh" +#include "ConstantTensorDescriptor.hip.hpp" template __device__ void @@ -245,11 +245,10 @@ struct BlockwiseChwnTensorCopyPadded constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; const Float* p_src_tmp = - p_src + - src_desc.Get1dIndex(c_block_data_begin, - (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, - (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, - n_block_data_begin); + p_src + src_desc.Get1dIndex(c_block_data_begin, + (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, + (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, + n_block_data_begin); #if 0 if(get_thread_local_1d_id() == 0) diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.hip.hpp similarity index 87% rename from src/include/blockwise_direct_convolution.cuh rename to src/include/blockwise_direct_convolution.hip.hpp index 48856ffba7..c6f7965de0 100644 --- a/src/include/blockwise_direct_convolution.cuh +++ b/src/include/blockwise_direct_convolution.hip.hpp @@ -1,7 +1,7 @@ #pragma once -#include "ConstantTensorDescriptor.cuh" -#include "threadwise_4d_tensor_op.cuh" -#include "threadwise_direct_convolution.cuh" +#include "ConstantTensorDescriptor.hip.hpp" +#include "threadwise_4d_tensor_op.hip.hpp" +#include "threadwise_direct_convolution.hip.hpp" template diff --git a/src/include/device.hpp b/src/include/device.hpp index 3d88454146..eec7dd5395 100644 --- a/src/include/device.hpp +++ b/src/include/device.hpp @@ -42,8 +42,8 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args) hipGetErrorString(hipGetLastError()); #elif DEVICE_BACKEND_CUDA - const void* f = reinterpret_cast(kernel); - void* p_args[] = {&args...}; + const void* f = reinterpret_cast(kernel); + void* p_args[] = {&args...}; timer.Start(); diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.hip.hpp similarity index 89% rename from src/include/gridwise_direct_convolution_1.cuh rename to src/include/gridwise_direct_convolution_1.hip.hpp index 443924fa26..49129b24d3 100644 --- a/src/include/gridwise_direct_convolution_1.cuh +++ b/src/include/gridwise_direct_convolution_1.hip.hpp @@ -1,8 +1,8 @@ #pragma once -#include "common.cuh" -#include "ConstantTensorDescriptor.cuh" -#include "blockwise_4d_tensor_op.cuh" -#include "blockwise_direct_convolution.cuh" +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_direct_convolution.hip.hpp" template ( in_nchw_global_desc, - p_in_global + - in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), in_chwn_block_desc, p_in_block, in_nchw_block_desc.GetLengths(), @@ -245,11 +244,10 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric out_hkwn_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + - out_nkhw_global_desc.Get1dIndex(n_block_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_hkwn_thread_desc.GetLengths(), reorder_nkhw_from_hkwn); #else @@ -263,11 +261,10 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + - out_nkhw_global_desc.Get1dIndex(n_block_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); #endif } diff --git a/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh b/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp similarity index 88% rename from src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh rename to src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp index 691675cd74..0ea28e9ac2 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp @@ -1,10 +1,10 @@ #pragma once -#include "common.cuh" -#include "ConstantTensorDescriptor.cuh" -#include "ConstantMatrixDescriptor.cuh" -#include "blockwise_4d_tensor_op.cuh" -#include "threadwise_4d_tensor_op.cuh" -#include "blockwise_gemm.cuh" +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "ConstantMatrixDescriptor.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "threadwise_4d_tensor_op.hip.hpp" +#include "blockwise_gemm.hip.hpp" template ( in_nchw_global_desc, - p_in_global + - in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), in_chwn_block_desc, p_in_block, in_nchw_block_desc.GetLengths(), @@ -180,10 +179,9 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric #if 1 // weight: global mem to LDS, // format is [S,R,C,K], no conversion needed - blockwise_wei_copy.Run( - p_wei_global + - wei_srck_global_desc.Get1dIndex(0, 0, c_block_data_begin, k_block_data_begin), - p_wei_block); + blockwise_wei_copy.Run(p_wei_global + wei_srck_global_desc.Get1dIndex( + 0, 0, c_block_data_begin, k_block_data_begin), + p_wei_block); #endif __syncthreads(); @@ -219,11 +217,10 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric out_hkwn_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + - out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_hkwn_thread_desc.GetLengths(), reorder_nkhw_from_hkwn); } diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp similarity index 97% rename from src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh rename to src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp index 5aec794a83..7427673d91 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp @@ -1,11 +1,11 @@ #pragma once -#include "common.cuh" -#include "ConstantTensorDescriptor.cuh" -#include "ConstantMatrixDescriptor.cuh" -#include "blockwise_4d_tensor_op.cuh" -#include "blockwise_2d_tensor_op.cuh" -#include "threadwise_2d_tensor_op.cuh" -#include "blockwise_gemm.cuh" +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "ConstantMatrixDescriptor.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" +#include "threadwise_2d_tensor_op.hip.hpp" +#include "blockwise_gemm.hip.hpp" // define B = flatten(N, Hi, Wi) template {}; #elif 0 - const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = Blockwise2dTensorCopy3{}; #elif 0 - const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = Blockwise2dTensorCopy3( - in_transform_thread_block_desc, - p_in_transform_block + - in_transform_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - y_thread_data_begin * InTileSizeH, - x_thread_data_begin * InTileSizeW), - wei_transform_thread_block_desc, - p_wei_transform_block + - wei_transform_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_transform_thread_desc, - p_out_transform_thread); + OutTileSizeW>(in_transform_thread_block_desc, + p_in_transform_block + in_transform_block_desc.Get1dIndex( + n_thread_data_begin, + c_thread_data, + y_thread_data_begin * InTileSizeH, + x_thread_data_begin * InTileSizeW), + wei_transform_thread_block_desc, + p_wei_transform_block + wei_transform_block_desc.Get1dIndex( + k_thread_data_begin, c_thread_data, 0, 0), + out_transform_thread_desc, + p_out_transform_thread); } }; diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp index d9e5c9c7c1..09ac224007 100644 --- a/src/include/tensor.hpp +++ b/src/include/tensor.hpp @@ -22,7 +22,8 @@ std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim) return os; } -typedef enum { +typedef enum +{ Half = 0, Float = 1, } DataType_t; diff --git a/src/include/threadwise_2d_tensor_op.cuh b/src/include/threadwise_2d_tensor_op.hip.hpp similarity index 99% rename from src/include/threadwise_2d_tensor_op.cuh rename to src/include/threadwise_2d_tensor_op.hip.hpp index ec7a759dce..cc48e88317 100644 --- a/src/include/threadwise_2d_tensor_op.cuh +++ b/src/include/threadwise_2d_tensor_op.hip.hpp @@ -1,5 +1,5 @@ #pragma once -#include "ConstantTensorDescriptor.cuh" +#include "ConstantTensorDescriptor.hip.hpp" template __device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __restrict__ p, F f) diff --git a/src/include/threadwise_4d_tensor_op.cuh b/src/include/threadwise_4d_tensor_op.hip.hpp similarity index 99% rename from src/include/threadwise_4d_tensor_op.cuh rename to src/include/threadwise_4d_tensor_op.hip.hpp index d715718af9..6cf413187f 100644 --- a/src/include/threadwise_4d_tensor_op.cuh +++ b/src/include/threadwise_4d_tensor_op.hip.hpp @@ -1,5 +1,5 @@ #pragma once -#include "ConstantTensorDescriptor.cuh" +#include "ConstantTensorDescriptor.hip.hpp" template __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __restrict__ p, F f) diff --git a/src/include/threadwise_direct_convolution.cuh b/src/include/threadwise_direct_convolution.hip.hpp similarity index 99% rename from src/include/threadwise_direct_convolution.cuh rename to src/include/threadwise_direct_convolution.hip.hpp index 4801cad8c3..04ea7b1506 100644 --- a/src/include/threadwise_direct_convolution.cuh +++ b/src/include/threadwise_direct_convolution.hip.hpp @@ -1,5 +1,5 @@ #pragma once -#include "ConstantTensorDescriptor.cuh" +#include "ConstantTensorDescriptor.hip.hpp" // optimized for scenario if p_in, p_wei, p_out are in register template diff --git a/src/include/threadwise_gemm.cuh b/src/include/threadwise_gemm.hip.hpp similarity index 100% rename from src/include/threadwise_gemm.cuh rename to src/include/threadwise_gemm.hip.hpp