diff --git a/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp b/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp index 7e20cccac5..790bcfbb9a 100644 --- a/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp +++ b/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp @@ -1,8 +1,8 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hip.hpp" -#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp" +#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp" template void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc, diff --git a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 217eb853d9..05e85f5bfb 100644 --- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -1,11 +1,11 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp" +#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp" +#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp" +#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp" +#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp" template void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp index 3237a7310b..cebc92f907 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp @@ -1,10 +1,10 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp" +#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp" +#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp" +#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp" template void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp index acd8176023..43c8512b87 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp @@ -1,9 +1,9 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp" +#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp" +#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp" template void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, diff --git a/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp index 54e777dbe4..8033d32b53 100644 --- a/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp @@ -1,9 +1,9 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" +#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" +#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp" template void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc, diff --git a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp index b3b6d785bf..17feafef95 100644 --- a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp @@ -1,9 +1,9 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp" +#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" +#include "gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp" template void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, diff --git a/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp b/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp index 618d330534..0704ddc867 100644 --- a/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp @@ -1,9 +1,9 @@ #pragma once #include #include "device.hpp" -#include "gridwise_convolution_wrapper.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp" -#include "gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp" +#include "gridwise_convolution_wrapper.hpp" +#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp" +#include "gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp" template void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc, diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index 7790900f83..cddbb4df3c 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -1,7 +1,7 @@ #pragma once #include #include "device.hpp" -#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" +#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" template void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, diff --git a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp index dc0bb2f7e1..f8ed51516d 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp +++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp @@ -1,7 +1,7 @@ #pragma once #include #include "device.hpp" -#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp" +#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" template void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc, diff --git a/driver/driver.hip.cpp b/driver/driver.cpp similarity index 99% rename from driver/driver.hip.cpp rename to driver/driver.cpp index 40cd4fdd3f..ededb0d87f 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.cpp @@ -5,8 +5,8 @@ #include #include "config.h" #include "tensor.hpp" -#include "ConstantTensorDescriptor.hip.hpp" -#include "conv_common.hip.hpp" +#include "ConstantTensorDescriptor.hpp" +#include "conv_common.hpp" #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp" //#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" diff --git a/driver/driver.cu b/driver/driver.cu index 974d4de85e..1ca4fea9d7 120000 --- a/driver/driver.cu +++ b/driver/driver.cu @@ -1 +1 @@ -driver.hip.cpp \ No newline at end of file +driver.cpp \ No newline at end of file diff --git a/src/include/Array.hip.hpp b/src/include/Array.hpp similarity index 99% rename from src/include/Array.hip.hpp rename to src/include/Array.hpp index 5e00d32a09..9f9192bad9 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hpp @@ -1,6 +1,6 @@ #pragma once -#include "Sequence.hip.hpp" -#include "functional2.hip.hpp" +#include "Sequence.hpp" +#include "functional2.hpp" template struct Array diff --git a/src/include/ConstantMatrixDescriptor.hip.hpp b/src/include/ConstantMatrixDescriptor.hpp similarity index 98% rename from src/include/ConstantMatrixDescriptor.hip.hpp rename to src/include/ConstantMatrixDescriptor.hpp index 4b15f75fa9..8339580d01 100644 --- a/src/include/ConstantMatrixDescriptor.hip.hpp +++ b/src/include/ConstantMatrixDescriptor.hpp @@ -1,5 +1,5 @@ #pragma once -#include "common.hip.hpp" +#include "common.hpp" template struct ConstantMatrixDescriptor diff --git a/src/include/ConstantMergedTensorDescriptor.hip.hpp b/src/include/ConstantMergedTensorDescriptor.hpp similarity index 98% rename from src/include/ConstantMergedTensorDescriptor.hip.hpp rename to src/include/ConstantMergedTensorDescriptor.hpp index 2333035190..21a08a3b67 100644 --- a/src/include/ConstantMergedTensorDescriptor.hip.hpp +++ b/src/include/ConstantMergedTensorDescriptor.hpp @@ -1,6 +1,6 @@ #pragma once -#include "common.hip.hpp" -#include "ConstantTensorDescriptor.hip.hpp" +#include "common.hpp" +#include "ConstantTensorDescriptor.hpp" // OriginalTensorDesc : ConstantTensorDescriptor<...> // it's the tensor whose dimensions are to be merged diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hpp similarity index 99% rename from src/include/ConstantTensorDescriptor.hip.hpp rename to src/include/ConstantTensorDescriptor.hpp index f28cb32733..f2decc3f54 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hpp @@ -1,5 +1,5 @@ #pragma once -#include "common.hip.hpp" +#include "common.hpp" template __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths) diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hpp similarity index 99% rename from src/include/Sequence.hip.hpp rename to src/include/Sequence.hpp index a29506f215..5d021631d6 100644 --- a/src/include/Sequence.hip.hpp +++ b/src/include/Sequence.hpp @@ -1,6 +1,6 @@ #pragma once -#include "integral_constant.hip.hpp" -#include "functional.hip.hpp" +#include "integral_constant.hpp" +#include "functional.hpp" template struct is_valid_sequence_map; diff --git a/src/include/amd_inline_asm.hip.hpp b/src/include/amd_inline_asm.hpp similarity index 99% rename from src/include/amd_inline_asm.hip.hpp rename to src/include/amd_inline_asm.hpp index 1e453d3cf5..05e34a26be 100644 --- a/src/include/amd_inline_asm.hip.hpp +++ b/src/include/amd_inline_asm.hpp @@ -1,5 +1,5 @@ #pragma once -#include "common.hip.hpp" +#include "common.hpp" #define NO_VM_WAIT 0 #define NO_LGKM_WAIT 0 diff --git a/src/include/base.hip.hpp b/src/include/base.hpp similarity index 100% rename from src/include/base.hip.hpp rename to src/include/base.hpp diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hpp similarity index 99% rename from src/include/blockwise_2d_tensor_op.hip.hpp rename to src/include/blockwise_2d_tensor_op.hpp index d6ddf8db5f..d39a74a1a3 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hpp @@ -1,6 +1,6 @@ #pragma once -#include "common.hip.hpp" -#include "ConstantTensorDescriptor.hip.hpp" +#include "common.hpp" +#include "ConstantTensorDescriptor.hpp" template __device__ void diff --git a/src/include/blockwise_3d_tensor_op.hip.hpp b/src/include/blockwise_3d_tensor_op.hpp similarity index 99% rename from src/include/blockwise_3d_tensor_op.hip.hpp rename to src/include/blockwise_3d_tensor_op.hpp index 058a5b4401..ad647fc9da 100644 --- a/src/include/blockwise_3d_tensor_op.hip.hpp +++ b/src/include/blockwise_3d_tensor_op.hpp @@ -1,6 +1,6 @@ #pragma once -#include "common.hip.hpp" -#include "ConstantTensorDescriptor.hip.hpp" +#include "common.hpp" +#include "ConstantTensorDescriptor.hpp" template __device__ void diff --git a/src/include/blockwise_batched_gemm.hip.hpp b/src/include/blockwise_batched_gemm.hpp similarity index 99% rename from src/include/blockwise_batched_gemm.hip.hpp rename to src/include/blockwise_batched_gemm.hpp index 6e397d1efa..937bf5ee8a 100644 --- a/src/include/blockwise_batched_gemm.hip.hpp +++ b/src/include/blockwise_batched_gemm.hpp @@ -1,5 +1,5 @@ #pragma once -#include "threadwise_gemm.hip.hpp" +#include "threadwise_gemm.hpp" template diff --git a/src/include/functional.hip.hpp b/src/include/functional.hpp similarity index 95% rename from src/include/functional.hip.hpp rename to src/include/functional.hpp index 776abe0b2a..84e5cffe83 100644 --- a/src/include/functional.hip.hpp +++ b/src/include/functional.hpp @@ -1,6 +1,6 @@ #pragma once -#include "integral_constant.hip.hpp" -#include "Sequence.hip.hpp" +#include "integral_constant.hpp" +#include "Sequence.hpp" struct forwarder { diff --git a/src/include/functional2.hip.hpp b/src/include/functional2.hpp similarity index 96% rename from src/include/functional2.hip.hpp rename to src/include/functional2.hpp index 6633abd316..c1dec36575 100644 --- a/src/include/functional2.hip.hpp +++ b/src/include/functional2.hpp @@ -1,6 +1,6 @@ #pragma once -#include "functional.hip.hpp" -#include "Sequence.hip.hpp" +#include "functional.hpp" +#include "Sequence.hpp" template struct static_for_impl; diff --git a/src/include/functional3.hip.hpp b/src/include/functional3.hpp similarity index 96% rename from src/include/functional3.hip.hpp rename to src/include/functional3.hpp index 4019725c4c..ee3ab656f4 100644 --- a/src/include/functional3.hip.hpp +++ b/src/include/functional3.hpp @@ -1,8 +1,8 @@ #pragma once -#include "functional.hip.hpp" -#include "functional2.hip.hpp" -#include "Sequence.hip.hpp" -#include "Array.hip.hpp" +#include "functional.hpp" +#include "functional2.hpp" +#include "Sequence.hpp" +#include "Array.hpp" // RemainLengths: Sequence<...> template diff --git a/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp similarity index 97% rename from src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp rename to src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp index 04ec8f4c62..5bc5aa39a3 100644 --- a/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp @@ -1,10 +1,10 @@ #pragma once -#include "common.hip.hpp" -#include "ConstantTensorDescriptor.hip.hpp" -#include "blockwise_2d_tensor_op.hip.hpp" -#include "blockwise_4d_tensor_op.hip.hpp" -#include "threadwise_tensor_slice_op.hip.hpp" -#include "threadwise_direct_convolution.hip.hpp" +#include "common.hpp" +#include "ConstantTensorDescriptor.hpp" +#include "blockwise_2d_tensor_op.hpp" +#include "blockwise_4d_tensor_op.hpp" +#include "threadwise_tensor_slice_op.hpp" +#include "threadwise_direct_convolution.hpp" template __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift) diff --git a/src/include/threadwise_direct_convolution.hip.hpp b/src/include/threadwise_direct_convolution.hpp similarity index 99% rename from src/include/threadwise_direct_convolution.hip.hpp rename to src/include/threadwise_direct_convolution.hpp index 3ba4a8dd4e..a518c2d182 100644 --- a/src/include/threadwise_direct_convolution.hip.hpp +++ b/src/include/threadwise_direct_convolution.hpp @@ -1,6 +1,6 @@ #pragma once -#include "ConstantTensorDescriptor.hip.hpp" -#include "threadwise_tensor_slice_op.hip.hpp" +#include "ConstantTensorDescriptor.hpp" +#include "threadwise_tensor_slice_op.hpp" // optimized for scenario if p_in, p_wei, p_out are in register template diff --git a/src/include/threadwise_gemm.hip.hpp b/src/include/threadwise_gemm.hpp similarity index 98% rename from src/include/threadwise_gemm.hip.hpp rename to src/include/threadwise_gemm.hpp index 61a4e45151..d28e9e73ea 100644 --- a/src/include/threadwise_gemm.hip.hpp +++ b/src/include/threadwise_gemm.hpp @@ -1,6 +1,6 @@ #pragma once -#include "common.hip.hpp" -#include "ConstantMatrixDescriptor.hip.hpp" +#include "common.hpp" +#include "ConstantMatrixDescriptor.hpp" template __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread) diff --git a/src/include/threadwise_generic_tensor_slice_op.hip.hpp b/src/include/threadwise_generic_tensor_slice_op.hpp similarity index 97% rename from src/include/threadwise_generic_tensor_slice_op.hip.hpp rename to src/include/threadwise_generic_tensor_slice_op.hpp index 9a7e5ae062..7ffed89f8e 100644 --- a/src/include/threadwise_generic_tensor_slice_op.hip.hpp +++ b/src/include/threadwise_generic_tensor_slice_op.hpp @@ -1,6 +1,6 @@ #pragma once -#include "ConstantTensorDescriptor.hip.hpp" -#include "ConstantMergedTensorDescriptor.hip.hpp" +#include "ConstantTensorDescriptor.hpp" +#include "ConstantMergedTensorDescriptor.hpp" template diff --git a/src/include/vector_type.hip.hpp b/src/include/vector_type.hpp similarity index 98% rename from src/include/vector_type.hip.hpp rename to src/include/vector_type.hpp index ef8feeadda..3f5a3a10a8 100644 --- a/src/include/vector_type.hip.hpp +++ b/src/include/vector_type.hpp @@ -1,6 +1,6 @@ #pragma once #include "config.h" -#include "integral_constant.hip.hpp" +#include "integral_constant.hpp" template struct vector_type