mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
remove .hip extension
This commit is contained in:
@@ -1,8 +1,8 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_convolution_wrapper.hip.hpp"
|
||||
#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp"
|
||||
#include "gridwise_convolution_wrapper.hpp"
|
||||
#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
|
||||
|
||||
template <class T, class InDesc, class WeiDesc, class OutDesc>
|
||||
void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_convolution_wrapper.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp"
|
||||
#include "gridwise_convolution_wrapper.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp"
|
||||
|
||||
template <class T, class InDesc, class WeiDesc, class OutDesc>
|
||||
void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_convolution_wrapper.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp"
|
||||
#include "gridwise_convolution_wrapper.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp"
|
||||
|
||||
template <class T, class InDesc, class WeiDesc, class OutDesc>
|
||||
void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_convolution_wrapper.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp"
|
||||
#include "gridwise_convolution_wrapper.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
|
||||
|
||||
template <class T, class InDesc, class WeiDesc, class OutDesc>
|
||||
void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_convolution_wrapper.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp"
|
||||
#include "gridwise_convolution_wrapper.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
|
||||
|
||||
template <class T, class InDesc, class WeiDesc, class OutDesc>
|
||||
void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_convolution_wrapper.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp"
|
||||
#include "gridwise_convolution_wrapper.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
|
||||
|
||||
template <class T, class InDesc, class WeiDesc, class OutDesc>
|
||||
void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_convolution_wrapper.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp"
|
||||
#include "gridwise_convolution_wrapper.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp"
|
||||
|
||||
template <class T, class InDesc, class WeiDesc, class OutDesc>
|
||||
void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp"
|
||||
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
|
||||
|
||||
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
|
||||
void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp"
|
||||
#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
|
||||
|
||||
template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
|
||||
void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
#include <stdlib.h>
|
||||
#include "config.h"
|
||||
#include "tensor.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "conv_common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "conv_common.hpp"
|
||||
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
|
||||
//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
|
||||
@@ -1 +1 @@
|
||||
driver.hip.cpp
|
||||
driver.cpp
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "Sequence.hip.hpp"
|
||||
#include "functional2.hip.hpp"
|
||||
#include "Sequence.hpp"
|
||||
#include "functional2.hpp"
|
||||
|
||||
template <class TData, index_t NSize>
|
||||
struct Array
|
||||
@@ -1,5 +1,5 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "common.hpp"
|
||||
|
||||
template <index_t NRow_, index_t NCol_, index_t RowStride_>
|
||||
struct ConstantMatrixDescriptor
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
|
||||
// OriginalTensorDesc : ConstantTensorDescriptor<...>
|
||||
// it's the tensor whose dimensions are to be merged
|
||||
@@ -1,5 +1,5 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "common.hpp"
|
||||
|
||||
template <class Lengths>
|
||||
__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "integral_constant.hip.hpp"
|
||||
#include "functional.hip.hpp"
|
||||
#include "integral_constant.hpp"
|
||||
#include "functional.hpp"
|
||||
|
||||
template <class Seq>
|
||||
struct is_valid_sequence_map;
|
||||
@@ -1,5 +1,5 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "common.hpp"
|
||||
|
||||
#define NO_VM_WAIT 0
|
||||
#define NO_LGKM_WAIT 0
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
|
||||
template <index_t BlockSize, class Float, class DstDesc, class F>
|
||||
__device__ void
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
|
||||
template <index_t BlockSize,
|
||||
class Float,
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
|
||||
template <index_t BlockSize, class Float, class DstDesc, class F>
|
||||
__device__ void
|
||||
@@ -1,5 +1,5 @@
|
||||
#pragma once
|
||||
#include "threadwise_gemm.hip.hpp"
|
||||
#include "threadwise_gemm.hpp"
|
||||
|
||||
template <index_t BlockSize,
|
||||
class BlockMatrixA,
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "threadwise_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "threadwise_gemm.hpp"
|
||||
|
||||
// if following number are power of 2, index calculation shall be greatly reduced:
|
||||
// MPerThreadSubC, NPerThreadSubC, MLevel0Cluster, NLevel0Cluster, MLevel1Cluster, NLevel1Cluster
|
||||
@@ -1,5 +1,5 @@
|
||||
#pragma once
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
|
||||
// slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
|
||||
// memory layout (ordering of dimensions) can be different between src and dst
|
||||
@@ -1,5 +1,5 @@
|
||||
#pragma once
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
|
||||
template <index_t BlockSize,
|
||||
class Float,
|
||||
@@ -1,13 +0,0 @@
|
||||
#pragma once
|
||||
#include "base.hip.hpp"
|
||||
#include "vector_type.hip.hpp"
|
||||
#include "integral_constant.hip.hpp"
|
||||
#include "Sequence.hip.hpp"
|
||||
#include "Array.hip.hpp"
|
||||
#include "functional.hip.hpp"
|
||||
#include "functional2.hip.hpp"
|
||||
#include "functional3.hip.hpp"
|
||||
|
||||
#if USE_AMD_INLINE_ASM
|
||||
#include "amd_inline_asm.hip.hpp"
|
||||
#endif
|
||||
13
src/include/common.hpp
Normal file
13
src/include/common.hpp
Normal file
@@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
#include "base.hpp"
|
||||
#include "vector_type.hpp"
|
||||
#include "integral_constant.hpp"
|
||||
#include "Sequence.hpp"
|
||||
#include "Array.hpp"
|
||||
#include "functional.hpp"
|
||||
#include "functional2.hpp"
|
||||
#include "functional3.hpp"
|
||||
|
||||
#if USE_AMD_INLINE_ASM
|
||||
#include "amd_inline_asm.hpp"
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
#pragma once
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
|
||||
// this is ugly, only for 4d
|
||||
template <class InDesc, class WeiDesc>
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "integral_constant.hip.hpp"
|
||||
#include "Sequence.hip.hpp"
|
||||
#include "integral_constant.hpp"
|
||||
#include "Sequence.hpp"
|
||||
|
||||
struct forwarder
|
||||
{
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "functional.hip.hpp"
|
||||
#include "Sequence.hip.hpp"
|
||||
#include "functional.hpp"
|
||||
#include "Sequence.hpp"
|
||||
|
||||
template <class>
|
||||
struct static_for_impl;
|
||||
@@ -1,8 +1,8 @@
|
||||
#pragma once
|
||||
#include "functional.hip.hpp"
|
||||
#include "functional2.hip.hpp"
|
||||
#include "Sequence.hip.hpp"
|
||||
#include "Array.hip.hpp"
|
||||
#include "functional.hpp"
|
||||
#include "functional2.hpp"
|
||||
#include "Sequence.hpp"
|
||||
#include "Array.hpp"
|
||||
|
||||
// RemainLengths: Sequence<...>
|
||||
template <class RemainLengths>
|
||||
@@ -1,10 +1,10 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_direct_convolution.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_4d_tensor_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_direct_convolution.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
@@ -1,12 +1,12 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_batched_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_batched_gemm.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
@@ -1,13 +1,13 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_3d_tensor_op.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_batched_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_3d_tensor_op.hpp"
|
||||
#include "blockwise_4d_tensor_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_batched_gemm.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
@@ -1,13 +1,13 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_3d_tensor_op.hip.hpp"
|
||||
#include "blockwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_batched_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_3d_tensor_op.hpp"
|
||||
#include "blockwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_batched_gemm.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
@@ -1,12 +1,12 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_batched_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_4d_tensor_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_batched_gemm.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
@@ -1,12 +1,12 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_batched_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_4d_tensor_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_batched_gemm.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
@@ -1,12 +1,12 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_batched_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_batched_gemm.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
@@ -1,12 +1,12 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_batched_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_batched_gemm.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
@@ -1,12 +1,12 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_batched_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_batched_gemm.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
@@ -1,12 +1,12 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_batched_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_batched_gemm.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
@@ -1,10 +1,10 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_gemm.hpp"
|
||||
|
||||
// define B = flatten(N, Hi, Wi)
|
||||
template <index_t GridSize,
|
||||
@@ -1,11 +1,11 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "blockwise_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
#include "blockwise_gemm.hpp"
|
||||
|
||||
// define B = flatten(N, Hi, Wi)
|
||||
template <index_t GridSize,
|
||||
@@ -1,10 +1,10 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_generic_tensor_slice_op.hip.hpp"
|
||||
#include "blockwise_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_generic_tensor_slice_op.hpp"
|
||||
#include "blockwise_gemm.hpp"
|
||||
|
||||
// define B = merge(N0, Ho, Wo)
|
||||
template <index_t GridSize,
|
||||
@@ -1,10 +1,10 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_generic_tensor_slice_op.hip.hpp"
|
||||
#include "blockwise_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_generic_tensor_slice_op.hpp"
|
||||
#include "blockwise_gemm.hpp"
|
||||
|
||||
// define B = merge(N0, Ho, Wo)
|
||||
template <index_t GridSize,
|
||||
@@ -1,11 +1,11 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_generic_tensor_slice_op.hip.hpp"
|
||||
#include "blockwise_gemm.hip.hpp"
|
||||
#include "threadwise_generic_tensor_slice_op.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_generic_tensor_slice_op.hpp"
|
||||
#include "blockwise_gemm.hpp"
|
||||
#include "threadwise_generic_tensor_slice_op.hpp"
|
||||
|
||||
// define B = merge(N0, Ho, Wo)
|
||||
template <index_t GridSize,
|
||||
@@ -1,11 +1,11 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_generic_tensor_slice_op.hip.hpp"
|
||||
#include "blockwise_gemm.hip.hpp"
|
||||
#include "threadwise_generic_tensor_slice_op.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_generic_tensor_slice_op.hpp"
|
||||
#include "blockwise_gemm.hpp"
|
||||
#include "threadwise_generic_tensor_slice_op.hpp"
|
||||
|
||||
// define B = merge(N0, Ho, Wo)
|
||||
template <index_t GridSize,
|
||||
@@ -1,11 +1,11 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_direct_convolution.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "threadwise_direct_convolution.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "blockwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_direct_convolution.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "threadwise_direct_convolution.hpp"
|
||||
|
||||
template <class TInWei,
|
||||
class TOut,
|
||||
@@ -1,11 +1,11 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_2d_tensor_op.hip.hpp"
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "blockwise_gemm.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
#include "blockwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_2d_tensor_op.hpp"
|
||||
#include "threadwise_4d_tensor_op.hpp"
|
||||
#include "blockwise_gemm.hpp"
|
||||
|
||||
template <index_t GridSize,
|
||||
index_t BlockSize,
|
||||
@@ -1,5 +1,5 @@
|
||||
#pragma once
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
|
||||
template <class Float, class Desc, class IDim, class NShift>
|
||||
__device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "threadwise_tensor_slice_op.hpp"
|
||||
|
||||
// optimized for scenario if p_in, p_wei, p_out are in register
|
||||
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ConstantMatrixDescriptor.hpp"
|
||||
|
||||
template <class Float, class Matrix>
|
||||
__device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hpp"
|
||||
|
||||
template <class Float,
|
||||
class SrcDesc,
|
||||
@@ -1,5 +1,5 @@
|
||||
#pragma once
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
|
||||
// need to assume src and dst is aligned
|
||||
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "config.h"
|
||||
#include "integral_constant.hip.hpp"
|
||||
#include "integral_constant.hpp"
|
||||
|
||||
template <class T, index_t N>
|
||||
struct vector_type
|
||||
Reference in New Issue
Block a user